diff --git a/.gitignore b/.gitignore
index 82f4286297..89ef98427f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,3 +41,4 @@ storybook-static
 envs/qwerty.env
 .parcel-cache
 **/.DS_Store
+cpu-cycles/libcpucycles/build
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
index 662c55f585..e168a3d9b8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -886,6 +886,14 @@ dependencies = [
  "uint",
 ]
 
+[[package]]
+name = "cpu-cycles"
+version = "0.1.0"
+dependencies = [
+ "cfg-if",
+ "libc",
+]
+
 [[package]]
 name = "cpufeatures"
 version = "0.2.5"
@@ -3495,6 +3503,7 @@ dependencies = [
  "bs58",
  "clap 4.1.11",
  "colored",
+ "cpu-cycles",
  "cupid",
  "dirs",
  "futures",
diff --git a/cpu-cycles/Cargo.toml b/cpu-cycles/Cargo.toml
new file mode 100644
index 0000000000..59d2ed52b6
--- /dev/null
+++ b/cpu-cycles/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "cpu-cycles"
+version = "0.1.0"
+edition = "2021"
+build = "build.rs"
+links = "cpucycles"
+
+[dependencies]
+libc = "0.2.140"
+
+[build-dependencies]
+cfg-if = "1"
\ No newline at end of file
diff --git a/cpu-cycles/build.rs b/cpu-cycles/build.rs
new file mode 100644
index 0000000000..d86a4f6df3
--- /dev/null
+++ b/cpu-cycles/build.rs
@@ -0,0 +1,65 @@
+use std::{env, path::PathBuf, process::Command};
+
+/// Runs `command` to completion, panicking with its captured stderr if it
+/// cannot be spawned or exits unsuccessfully.
+///
+/// `what` names the step in the panic message; panicking is how a build
+/// script reports failure to Cargo.
+fn run_step(what: &str, command: &mut Command) {
+    let output = command
+        .output()
+        .unwrap_or_else(|e| panic!("failed to run {what}: {e}"));
+    if !output.status.success() {
+        // Tool output is not guaranteed to be UTF-8, so decode it lossily
+        // instead of asserting validity with `unsafe`.
+        panic!(
+            "{what} failed ({}):\n{}",
+            output.status,
+            String::from_utf8_lossy(&output.stderr)
+        );
+    }
+}
+
+/// Builds the vendored `libcpucycles` with its own `configure`/`make` scripts,
+/// installs it under `OUT_DIR`, and tells Cargo to link `libcpucycles.a`.
+///
+/// # Panics
+///
+/// Panics (failing the build) if the target architecture is not in the
+/// supported list, if `libcpucycles/` cannot be resolved, or if a build step
+/// fails.
+fn main() {
+    const SUPPORTED: [&str; 6] = ["x86", "x86_64", "mips", "powerpc", "powerpc64", "arm"];
+
+    // A build script is compiled for the host, so `#[cfg(target_arch)]` would
+    // test the host; Cargo passes the real target in this variable.
+    let target_arch = env::var("CARGO_CFG_TARGET_ARCH").expect("CARGO_CFG_TARGET_ARCH not set");
+    if !SUPPORTED.contains(&target_arch.as_str()) {
+        panic!("Unsupported architecture - {target_arch}!");
+    }
+
+    let out_dir = env::var("OUT_DIR").expect("OUT_DIR not set by Cargo");
+    let out_path = PathBuf::from(&out_dir);
+    let source_path = PathBuf::from("libcpucycles")
+        .canonicalize()
+        .expect("cannot canonicalize path");
+
+    // `configure` generates the build tree inside the source directory and
+    // records the prefix that the generated `install` rule is given.
+    run_step(
+        "./configure",
+        Command::new("./configure")
+            .current_dir(&source_path)
+            .arg(format!("--prefix={out_dir}")),
+    );
+    run_step("make install", Command::new("make").current_dir(&source_path).arg("install"));
+
+    // Link statically against the library that `make install` is expected to
+    // have placed in `<OUT_DIR>/lib`.
+    println!("cargo:rustc-link-search=native={}", out_path.join("lib").display());
+    println!("cargo:rustc-link-lib=static=cpucycles");
+
+    // Delete the generated per-host build tree again so the source directory
+    // is not left holding build products.
+    run_step("make clean", Command::new("make").current_dir(&source_path).arg("clean"));
+}
diff --git a/cpu-cycles/libcpucycles/Makefile b/cpu-cycles/libcpucycles/Makefile
new file mode 100644
index 0000000000..8968a9bf80
--- /dev/null
+++ b/cpu-cycles/libcpucycles/Makefile
@@ -0,0 +1,8 @@
+default: # build through the Makefile that ./configure writes into build/
+	cd build && $(MAKE)
+
+install: # forward to the generated install rule
+	cd build && $(MAKE) install
+
+clean: # forward to the generated clean rule
+	cd build && $(MAKE) clean
diff --git a/cpu-cycles/libcpucycles/autogen/html b/cpu-cycles/libcpucycles/autogen/html
new file mode 100755
index 0000000000..84a0ff2dab
--- /dev/null
+++ b/cpu-cycles/libcpucycles/autogen/html
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+
+import os
+import datetime
+import markdown
+
+def load(fn):
+  with open(fn) as f:
+    return f.read()
+
+style = load('autogen/html-style')
+sitetitle = load('autogen/html-title')
+
+files = []
+
+with open('autogen/html-files') as f:
+  for line in f:
+    line = line.strip()
+    line = line.split(':')
+    if len(line) != 3: continue
+    files += [line]
+
+for md,html,pagetitle in files:
+  fnmd = 'doc/%s.md' % md
+  fnhtml = 'doc/html/%s.html' % html
+  output = ''
+
+  x = load(fnmd)
+  x = markdown.markdown(x,extensions=['markdown.extensions.extra','markdown.extensions.tables'])
+  mtime = datetime.datetime.utcfromtimestamp(os.path.getmtime(fnmd)).strftime('%Y.%m.%d')
+
+  output += '<html>\n<head>\n'
+  output += style
+  output += '<title>\n'
+  output += pagetitle
+  output += '</title>\n'
+  output += '</head>\n'
+  output += '<body>\n'
+
+  output += '<div class=headline>\n'
+  output += sitetitle
+  output += '</div>\n'
+
+  output += '<div class=nav>\n'
+  for submd,subhtml,subpagetitle in files:
+    if subhtml == html:
+      output += '<div class="navt here">'
+      output += pagetitle+'\n'
+    else:
+      output += '<div class="navt away">'
+      output += '<a href=%s.html>%s</a>\n' % (subhtml,subpagetitle)
+    output += '</div>'
+  output += '</div>\n'
+
+  output += '<div class=main>\n'
+  output += x
+  output += '<hr><font size=1><b>Version:</b>\n'
+  output += 'This is version %s of the "%s" web page.\n' % (mtime,pagetitle)
+  output += '</font>\n'
+  output += '</div>\n'
+
+  output += '</body>\n'
+  output += '</html>\n'
+
+  if not os.path.exists(fnhtml) or output != load(fnhtml):
+    with open(fnhtml+'.new','w') as f:
+      f.write(output)
+    os.chmod(fnhtml+'.new',0o444)
+    os.rename(fnhtml+'.new',fnhtml)
diff --git a/cpu-cycles/libcpucycles/autogen/html-files b/cpu-cycles/libcpucycles/autogen/html-files
new file mode 100644
index 0000000000..1dd6f3b34c
--- /dev/null
+++ b/cpu-cycles/libcpucycles/autogen/html-files
@@ -0,0 +1,7 @@
+readme:index:Intro
+download:download:Download
+install:install:Install
+api:api:API
+counters:counters:Counters
+selection:selection:Selection
+security:security:Security
diff --git a/cpu-cycles/libcpucycles/autogen/html-style b/cpu-cycles/libcpucycles/autogen/html-style
new file mode 100644
index 0000000000..7d189e11d0
--- /dev/null
+++ b/cpu-cycles/libcpucycles/autogen/html-style
@@ -0,0 +1,32 @@
+<meta http-equiv="content-type" content="text/html; charset=utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<style type="text/css">
+html{overflow-y:scroll}
+body{font-family:sans-serif}
+p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
+li p{font-size:1.0em}
+blockquote p{font-size:1.0em}
+tt{font-size:1.2em}
+code{font-size:1.2em}
+h1{font-size:1.5em}
+h2{font-size:1.3em}
+h3{font-size:1.0em}
+h1 a{text-decoration:none}
+table{border-collapse:collapse}
+th,td{border:1px solid black}
+table a{text-decoration:none}
+table tr{font-size:0.9em;line-height:1.6em}
+.links a:hover{text-decoration:underline}
+.links a:active{text-decoration:underline}
+.links img{width:200px;padding-left:1em}
+.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
+.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
+.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
+min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
+font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
+.here{border-bottom:0px;background-color:#ffffff}
+.away{background-color:#125d0d;}
+.away a{text-decoration:none;display:block;color:#ffffff}
+.away a:hover,.away a:active{text-decoration:underline}
+.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
+</style>
diff --git a/cpu-cycles/libcpucycles/autogen/html-title b/cpu-cycles/libcpucycles/autogen/html-title
new file mode 100644
index 0000000000..db7e3e2170
--- /dev/null
+++ b/cpu-cycles/libcpucycles/autogen/html-title
@@ -0,0 +1 @@
+libcpucycles
diff --git a/cpu-cycles/libcpucycles/autogen/man b/cpu-cycles/libcpucycles/autogen/man
new file mode 100755
index 0000000000..2ef7b9b507
--- /dev/null
+++ b/cpu-cycles/libcpucycles/autogen/man
@@ -0,0 +1,3 @@
+#!/bin/sh
+# Render doc/api.md with pandoc as the section-3 man page doc/man/cpucycles.3.
+pandoc --standalone --to man --metadata title=cpucycles --metadata section=3 < doc/api.md > doc/man/cpucycles.3
diff --git a/cpu-cycles/libcpucycles/command/cpucycles-info.c b/cpu-cycles/libcpucycles/command/cpucycles-info.c
new file mode 100644
index 0000000000..774fa178b1
--- /dev/null
+++ b/cpu-cycles/libcpucycles/command/cpucycles-info.c
@@ -0,0 +1,93 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <cpucycles.h>
+
+#define TIMINGS 63
+static long long t[TIMINGS+1];
+
+// Replace t[0..TIMINGS) by successive differences; print their median and each difference's offset from it.
+static void t_print(void)
+{
+  long long median = 0;
+  long long i,j;
+  for (i = 0;i < TIMINGS;++i) t[i] = t[i+1]-t[i];
+  // median: an entry with fewer than half of the entries strictly below it and strictly above it
+  for (j = 0;j < TIMINGS;++j) {
+    long long below = 0;
+    long long above = 0;
+    for (i = 0;i < TIMINGS;++i) {
+      below += (t[i] < t[j]);
+      above += (t[i] > t[j]);
+    }
+    if (below*2 >= TIMINGS || above*2 >= TIMINGS) continue;
+    median = t[j];
+    break;
+  }
+  printf(" %lld ",median);
+  for (i = 0;i < TIMINGS;++i) printf("%+lld",t[i]-median);
+  printf("\n");
+  fflush(stdout);
+}
+
+// Wall-clock time from gettimeofday() as a single microsecond count
+// (seconds scaled by 10^6, plus the microsecond remainder).
+static long long microseconds(void)
+{
+  struct timeval now;
+  long long usec;
+  gettimeofday(&now,(struct timezone *) 0);
+  usec = 1000000LL * now.tv_sec + now.tv_usec;
+  return usec;
+}
+
+static volatile int v;
+
+static void measure_cpucycles(void)
+{
+  long long loops,i,j;
+  // print what the library reports for its ticks-per-second value and implementation name
+  printf("cpucycles persecond %lld\n",cpucycles_persecond());
+  printf("cpucycles implementation %s\n",cpucycles_implementation());
+  // take TIMINGS+1 back-to-back readings; t_print() prints the median gap between them
+  for (i = 0;i <= TIMINGS;++i)
+    t[i] = cpucycles();
+  printf("cpucycles median"); t_print();
+  // compare the counter against gettimeofday() across busy loops of doubling length
+  for (loops = 1024;loops <= 1048576;loops *= 2) {
+    long long t00,t01,t10,t11;
+    long long m0,m1;
+    double ratiobelow,ratioabove;
+    // each microseconds() reading is bracketed by two counter readings
+    t00 = cpucycles();
+    m0 = microseconds();
+    t01 = cpucycles();
+
+    for (j = 0;j < loops;++j) v = 0;
+
+    t10 = cpucycles();
+    m1 = microseconds();
+    t11 = cpucycles();
+    // discard runs where the counter went backwards or at most 2 microseconds elapsed
+    if (t01 < t00) continue;
+    if (t10 < t01) continue;
+    if (t11 < t10) continue;
+    if (m1 <= m0+2) continue;
+    // lower bound: inner counter span over a widened time span; upper bound: outer span over a narrowed one
+    ratiobelow = floor((1000000.0*(t10-t01))/(m1+1-m0));
+    ratioabove = ceil((1000000.0*(t11-t00))/(m1-m0-1));
+
+    printf("cpucycles observed persecond %.0lf...%.0lf with %lld loops %lld microseconds\n",ratiobelow,ratioabove,loops,m1-m0);
+  }
+}
+
+int main(int argc,char **argv) // argc and argv are not used
+{
+  cpucycles_tracesetup(); // called before any other library function in this program
+  printf("cpucycles version %s\n",cpucycles_version());
+  measure_cpucycles(); // prints the frequency, implementation and timing lines
+  return 0;
+}
diff --git a/cpu-cycles/libcpucycles/compilers/default b/cpu-cycles/libcpucycles/compilers/default
new file mode 100644
index 0000000000..4f79c16a5e
--- /dev/null
+++ b/cpu-cycles/libcpucycles/compilers/default
@@ -0,0 +1,2 @@
+gcc -Wall -fPIC -fwrapv -O -fvisibility=hidden
+clang -Wall -fPIC -fwrapv -Qunused-arguments -O -fvisibility=hidden
diff --git a/cpu-cycles/libcpucycles/configure b/cpu-cycles/libcpucycles/configure
new file mode 100755
index 0000000000..679fd8a5e0
--- /dev/null
+++ b/cpu-cycles/libcpucycles/configure
@@ -0,0 +1,309 @@
+#!/usr/bin/env python3
+
+import os
+import shutil
+import sys
+import platform
+import subprocess
+import tempfile
+
+prefix = '/usr/local'
+clean = True
+linktype = 'so'
+
+host = platform.machine()
+host = ''.join(c for c in host if c in '_0123456789abcdefghijklmnopqrstuvwxyz')
+
+if host == 'x86_64': host = 'amd64'
+if host == 'i386': host = 'x86'
+if host == 'i686': host = 'x86'
+
+if host.startswith('armv8') or host.startswith('aarch64'): host = 'arm64'
+if host.startswith('arm'): host = 'arm32'
+
+if host.startswith('riscv64'): host = 'riscv64'
+if host.startswith('riscv'): host = 'riscv32'
+
+if host.startswith('mips64'): host = 'mips64'
+if host.startswith('mips'): host = 'mips32'
+
+if host.startswith('powerpc64') or host.startswith('ppc64'): host = 'ppc64'
+if host.startswith('powerpc') or host.startswith('ppc'): host = 'ppc32'
+
+if host.startswith('sparcv9') or host.startswith('sun4u'): host = 'sparc64'
+if host.startswith('sparc') or host.startswith('sun'): host = 'sparc32'
+
+makefile = ''
+
+for arg in sys.argv[1:]:
+  if arg.startswith('--prefix='):
+    prefix = arg[9:]
+    continue
+  if arg.startswith('--host='):
+    host = arg[7:]
+    host = host.split('-')[0]
+    continue
+  if arg == '--clean':
+    clean = True
+    continue
+  if arg == '--noclean':
+    clean = False
+    continue
+  raise ValueError('unrecognized argument %s' % arg)
+
+echoargs = './configure'
+echoargs += ' --prefix=%s' % prefix
+echoargs += ' --host=%s' % host
+if clean: echoargs += ' --clean'
+if not clean: echoargs += ' --noclean'
+print(echoargs)
+
+if prefix[0] != '/':
+  raise ValueError('prefix %s is not an absolute path' % prefix)
+
+rpath = None
+# XXX: rpath = '%s/lib' % prefix
+
+if clean:
+  shutil.rmtree('build/%s' % host,ignore_errors=True)
+
+def dirlinksym(dir,source,target):
+  with tempfile.TemporaryDirectory(dir=dir) as t:
+    os.symlink(target,'%s/symlink' % t)
+    os.rename('%s/symlink' % t,'%s/%s' % (dir,source))
+
+os.makedirs('build/%s' % host,exist_ok=True)
+os.makedirs('build/%s/package/bin' % host,exist_ok=True)
+os.makedirs('build/%s/package/lib' % host,exist_ok=True)
+os.makedirs('build/%s/package/include' % host,exist_ok=True)
+
+if clean:
+  os.symlink('../..','build/%s/src' % host)
+
+# ----- build scripts
+
+os.makedirs('build/%s/scripts'%host,exist_ok=True)
+dirlinksym('build/%s/scripts'%host,'install','../src/scripts-build/install')
+
+# ----- compilers
+
+def compilerversion(c):
+  try:
+    p = subprocess.Popen(c.split()+['--version'],stdout=subprocess.PIPE,stderr=subprocess.STDOUT,universal_newlines=True)
+    out,err = p.communicate()
+    assert not err
+    assert not p.returncode
+    return out
+  except:
+    pass
+
+firstcompiler = None
+
+with open('compilers/default') as f:
+  for c in f.readlines():
+    c = c.strip()
+    cv = compilerversion(c)
+    if cv == None:
+      print('skipping default compiler %s' % c)
+      continue
+    print('using default compiler %s' % c)
+    firstcompiler = c
+    break
+
+if firstcompiler is None:
+  raise ValueError('did not find a working compiler')
+
+with open('build/%s/scripts/compiledefault' % host,'w') as f:
+  f.write('#!/bin/sh\n')
+  f.write('\n')
+  f.write('dir="$1"; shift\n')
+  f.write('base="$1"; shift\n')
+  f.write('ext="$1"; shift\n')
+  f.write('\n')
+  f.write('cd "$dir" && \\\n')
+  f.write('%s \\\n' % firstcompiler)
+  f.write('  "$@" \\\n')
+  f.write('  -c "$base.$ext"\n')
+os.chmod('build/%s/scripts/compiledefault' % host,0o755)
+
+# ----- libcpucycles
+
+os.makedirs('build/%s/cpucycles' % host,exist_ok=True)
+os.makedirs('build/%s/package/man/man3' % host,exist_ok=True)
+# Both headers are linked into the build directory; the public header and the man page are copied into package/.
+dirlinksym('build/%s/cpucycles'%host,'cpucycles.h','../src/cpucycles/cpucycles.h')
+dirlinksym('build/%s/cpucycles'%host,'cpucycles_internal.h','../src/cpucycles/cpucycles_internal.h')
+shutil.copy2('cpucycles/cpucycles.h','build/%s/package/include/cpucycles.h'%host)
+shutil.copy2('doc/man/cpucycles.3','build/%s/package/man/man3/cpucycles.3'%host)
+# compile-ticks ARCH X: compile try-ARCH-X.c, falling back to try-default-zero.c; the -D flags give ticks/ticks_setup per-option names.
+with open('build/%s/cpucycles/compile-ticks' % host,'w') as f:
+  f.write('#!/bin/sh\n')
+  f.write('arch="$1"; shift\n')
+  f.write('x="$1"; shift\n')
+  f.write('for source in try-"$arch"-"$x".c try-default-zero.c\n')
+  f.write('do\n')
+  f.write('  cp "$source" "$arch"-"$x".c\n')
+  f.write('  %s \\\n' % firstcompiler)
+  f.write('    -Dticks=cpucycles_ticks_"$arch"_"$x" \\\n')
+  f.write('    -Dticks_setup=cpucycles_ticks_"$arch"_"$x"_setup \\\n')
+  f.write('    -c "$arch"-"$x".c\n')
+  f.write('  case $? in\n')
+  f.write('    0) break ;;\n')
+  f.write('    111) exit 111 ;;\n')
+  f.write('    *) echo "skipping option that did not compile" ;;\n')
+  f.write('  esac\n')
+  f.write('done\n')
+os.chmod('build/%s/cpucycles/compile-ticks' % host,0o755)
+
+cpucyclesoptions = []
+cpucyclesofiles = []
+# Keep each cpucycles/options entry of the form ARCH-NAME that has a source file and whose ARCH is this host or 'default'.
+with open('cpucycles/options') as f:
+  for line in f:
+    line = line.strip()
+    if line == '': continue
+    if line[0] == '#': continue
+    base = line.split()[0]
+    if not os.path.exists('cpucycles/%s.c' % base): continue
+    cpucycles = base.split('-')
+    if len(cpucycles) != 2: continue
+    if cpucycles[0] not in (host,'default'): continue
+    cpucyclesoptions += [cpucycles]
+
+cpucyclesoptions += [['default','zero']] # must be last
+# One object per option, built by compile-ticks; each rule is prepended to the Makefile text.
+for cpucycles in cpucyclesoptions:
+  base = '-'.join(cpucycles)
+  cpucyclesofiles += ['cpucycles/%s.o' % base]
+  dirlinksym('build/%s/cpucycles'%host,'try-%s.c'%base,'../src/cpucycles/%s.c'%base)
+  M = 'cpucycles/%s.o: cpucycles/try-%s.c cpucycles/try-default-zero.c\n' % (base,base)
+  M += '\tcd cpucycles && ./compile-ticks %s %s\n' % tuple(cpucycles)
+  M += '\n'
+  makefile = M + makefile
+# The cpucycles/*.c files without '-' in their name are compiled with scripts/compiledefault.
+for fn in sorted(os.listdir('cpucycles')):
+  if not fn.endswith('.c'): continue
+  if '-' in fn: continue
+  base = fn[:-2]
+  cpucyclesofiles += ['cpucycles/%s.o' % base]
+  dirlinksym('build/%s/cpucycles'%host,fn,'../src/cpucycles/%s'%fn)
+  M = 'cpucycles/%s.o: cpucycles/%s.c\n' % (base,base)
+  M += '\tscripts/compiledefault cpucycles %s c\n' % base
+  M += '\n'
+  makefile = M + makefile
+# options.inc: prototypes plus a table of {name, setup function, ticks function} for every option.
+with open('build/%s/cpucycles/options.inc' % host,'w') as f:
+  f.write('#define NUMOPTIONS %d\n' % len(cpucyclesoptions))
+  f.write('#define DEFAULTOPTION (NUMOPTIONS-1)\n')
+  f.write('\n')
+  for cpucycles in cpucyclesoptions:
+    f.write('extern long long cpucycles_ticks_%s_%s_setup(void);\n' % (cpucycles[0],cpucycles[1]))
+    f.write('extern long long cpucycles_ticks_%s_%s(void);\n' % (cpucycles[0],cpucycles[1]))
+  f.write('\n')
+  f.write('static struct {\n')
+  f.write('  const char *implementation;\n')
+  f.write('  long long (*ticks_setup)(void);\n')
+  f.write('  long long (*ticks)(void);\n')
+  f.write('} options[NUMOPTIONS] = {\n')
+  for cpucycles in cpucyclesoptions:
+    f.write('{ "%s-%s", cpucycles_ticks_%s_%s_setup, cpucycles_ticks_%s_%s },\n' % (cpucycles[0],cpucycles[1],cpucycles[0],cpucycles[1],cpucycles[0],cpucycles[1]))
+  f.write('} ;\n')
+
+dirlinksym('build/%s/scripts'%host,'staticlib','../src/scripts-build/staticlib')
+
+M = 'package/lib/libcpucycles.a: scripts/staticlib %s\n' % ' '.join(cpucyclesofiles)
+M += '\tscripts/staticlib %s\n' % ' '.join(cpucyclesofiles)
+M += '\n'
+makefile = M + makefile
+# scripts/sharedlib: link the given objects into package/lib/libcpucycles.so.1 (an rpath flag is written only when rpath is set).
+with open('build/%s/scripts/sharedlib' % host,'w') as f:
+  f.write('#!/bin/sh\n')
+  f.write('\n')
+  f.write('%s -shared \\\n' % firstcompiler)
+  if rpath:
+    f.write('  -Wl,-rpath=%s \\\n' % rpath)
+  f.write('  -Wl,-soname,libcpucycles.so.1 \\\n')
+  f.write('  -o package/lib/libcpucycles.so.1 \\\n')
+  f.write('  "$@"\n')
+  f.write('chmod 644 package/lib/libcpucycles.so.1\n')
+os.chmod('build/%s/scripts/sharedlib' % host,0o755)
+
+M = 'package/lib/libcpucycles.so.1: scripts/sharedlib %s\n' % ' '.join(cpucyclesofiles)
+M += '\tscripts/sharedlib %s\n' % ' '.join(cpucyclesofiles)
+M += '\n'
+makefile = M + makefile
+# libcpucycles.so is recreated as a symlink to the versioned library.
+M = 'package/lib/libcpucycles.so: package/lib/libcpucycles.so.1\n'
+M += '\trm -f package/lib/libcpucycles.so\n'
+M += '\tln -s libcpucycles.so.1 package/lib/libcpucycles.so\n'
+M += '\n'
+makefile = M + makefile
+
+# ----- command
+
+os.makedirs('build/%s/command'%host)
+for c in sorted(os.listdir('command')):
+  dirlinksym('build/%s/command'%host,c,'../src/command/%s'%c)
+dirlinksym('build/%s/command'%host,'bin','../package/bin')
+dirlinksym('build/%s/command'%host,'lib','../package/lib')
+dirlinksym('build/%s/command'%host,'include','../package/include')
+
+with open('build/%s/command/link' % host,'w') as f:
+  f.write('#!/bin/sh\n')
+  f.write('target="$1"; shift\n')
+  f.write('%s \\\n' % firstcompiler)
+  f.write('  -o "$target" "$@"\n')
+os.chmod('build/%s/command/link' % host,0o755)
+
+commands = []
+
+for fn in sorted(os.listdir('command')):
+  if not fn.endswith('.c'): continue
+
+  libs = ['libcpucycles']
+
+  base = fn[:-2]
+  M = 'command/%s.o: command/%s.c\n' % (base,base)
+  M += '\tscripts/compiledefault command %s c -I include\n' % base
+  M += '\n'
+  makefile = M + makefile
+  M = 'package/bin/%s: command/%s.o%s\n' % (base,base,''.join(' package/lib/%s.%s' % (x,linktype) for x in libs))
+  M += '\tcd command && ./link bin/%s %s.o%s -lm -lrt\n' % (base,base,''.join(' lib/%s.%s' % (x,linktype) for x in libs))
+  M += '\n'
+  makefile = M + makefile
+  commands += ['package/bin/%s' % base]
+
+M = 'commands: %s\n' % ' '.join(commands)
+M += '\n'
+makefile = M + makefile
+
+# ----- make install
+
+M = 'install: scripts/install default\n'
+M += '\tscripts/install %s\n' % prefix
+M += '\n'
+makefile = M + makefile
+
+# ----- make default
+
+M = 'default: package/lib/libcpucycles.a package/lib/libcpucycles.so package/lib/libcpucycles.so.1 \\\n'
+M += 'commands\n'
+M += '\n'
+makefile = M + makefile
+# Every rule was prepended as it was generated, so 'default' (added last) is the first target in the file.
+with open('build/%s/Makefile' % host,'w') as f:
+  f.write(makefile)
+
+# ----- build/0, build/Makefile
+
+dirlinksym('build','0',host)
+# build/Makefile forwards default and install into the host directory; clean removes that directory.
+with open('build/Makefile','w') as f:
+  f.write('default:\n')
+  f.write('\tcd %s && $(MAKE)\n' % host)
+  f.write('\n')
+  f.write('install:\n')
+  f.write('\tcd %s && $(MAKE) install\n' % host)
+  f.write('\n')
+  f.write('clean:\n')
+  f.write('\trm -r %s\n' % host)
diff --git a/cpu-cycles/libcpucycles/cpucycles/amd64-pmc.c b/cpu-cycles/libcpucycles/cpucycles/amd64-pmc.c
new file mode 100644
index 0000000000..732f19d198
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/amd64-pmc.c
@@ -0,0 +1,53 @@
+// version 20230105
+// public domain
+// djb
+// adapted from supercop/cpucycles/amd64rdpmc.c
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <linux/perf_event.h>
+#include "cpucycles_internal.h"
+
+static struct perf_event_attr attr;
+static int fdperf = -1;
+static struct perf_event_mmap_page *buf = 0;
+
+long long ticks(void)
+{
+  long long result;
+  unsigned int seq;
+  long long index;
+  long long offset;
+  // retry until buf->lock is unchanged across the read, so index, offset and the counter value belong together
+  do {
+    seq = buf->lock;
+    asm volatile("" ::: "memory"); // compiler barrier: keep the reads below after the lock read
+    index = buf->index;
+    offset = buf->offset;
+    asm volatile("rdpmc;shlq $32,%%rdx;orq %%rdx,%%rax"
+      : "=a"(result) : "c"(index-1) : "%rdx"); // rdpmc with ecx = index-1; merge rdx:rax into one 64-bit value
+    asm volatile("" ::: "memory"); // compiler barrier before re-reading the lock
+  } while (buf->lock != seq);
+
+  result += offset;
+  result &= 0xffffffffffff; // keep the low 48 bits
+  return result;
+}
+
+// Opens a cycle-count perf event (kernel excluded) and maps its page for ticks(); cpucycles_SKIP on any failure.
+long long ticks_setup(void)
+{
+  if (fdperf == -1) {
+    attr.type = PERF_TYPE_HARDWARE;
+    attr.config = PERF_COUNT_HW_CPU_CYCLES;
+    attr.exclude_kernel = 1;
+    fdperf = syscall(__NR_perf_event_open,&attr,0,-1,-1,0);
+    if (fdperf == -1) return cpucycles_SKIP;
+    buf = mmap(NULL,sysconf(_SC_PAGESIZE),PROT_READ,MAP_SHARED,fdperf,0);
+    if (buf == MAP_FAILED) { close(fdperf); fdperf = -1; buf = 0; return cpucycles_SKIP; } // never let ticks() read through MAP_FAILED
+  }
+  return cpucycles_works(ticks) ? cpucycles_CYCLECOUNTER : cpucycles_SKIP;
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/amd64-tsc.c b/cpu-cycles/libcpucycles/cpucycles/amd64-tsc.c
new file mode 100644
index 0000000000..921bbec494
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/amd64-tsc.c
@@ -0,0 +1,22 @@
+// version 20230105
+// public domain
+// djb
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+#include "cpucycles_internal.h"
+
+long long ticks(void)
+{
+  return __rdtsc(); // compiler intrinsic from <intrin.h> / <x86intrin.h>, converted to long long
+}
+
+// Offer this counter as cpucycles_MAYBECYCLECOUNTER unless cpucycles_works() rejects ticks().
+long long ticks_setup(void)
+{
+  return cpucycles_works(ticks) ? cpucycles_MAYBECYCLECOUNTER : cpucycles_SKIP;
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/amd64-tscasm.c b/cpu-cycles/libcpucycles/cpucycles/amd64-tscasm.c
new file mode 100644
index 0000000000..90af94dd78
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/amd64-tscasm.c
@@ -0,0 +1,20 @@
+// version 20230105
+// public domain
+// djb
+// adapted from supercop/cpucycles/amd64tscfreq.c
+
+#include "cpucycles_internal.h"
+
+long long ticks(void)
+{
+  unsigned long long result;
+  asm volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax"
+    : "=a"(result) :: "%rdx"); // raw opcode bytes 0x0f 0x31 (rdtsc), then merge rdx:rax into rax
+  return result;
+}
+
+// cpucycles_SKIP when cpucycles_works() rejects ticks(); otherwise a possible cycle counter.
+long long ticks_setup(void)
+{
+  return cpucycles_works(ticks) ? cpucycles_MAYBECYCLECOUNTER : cpucycles_SKIP;
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/arm32-cortex.c b/cpu-cycles/libcpucycles/cpucycles/arm32-cortex.c
new file mode 100644
index 0000000000..d4a23ae793
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/arm32-cortex.c
@@ -0,0 +1,27 @@
+// version 20230105
+// public domain
+// djb
+// adapted from supercop/cpucycles/cortex.c
+
+#include "cpucycles_internal.h"
+
+long long ticks(void)
+{
+  unsigned int result; // the coprocessor read below yields an unsigned int
+  asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(result));
+  return (unsigned long long) result; // zero-extended; ticks_setup() reports cpucycles_EXTEND32
+}
+
+// Returns long long (and an actual value) so it matches the callback type cpucycles_works() takes.
+static long long enable(void)
+{
+  asm volatile("mcr p15, 0, %0, c9, c12, 0" :: "r"(17));
+  asm volatile("mcr p15, 0, %0, c9, c12, 1" :: "r"(0x8000000f));
+  asm volatile("mcr p15, 0, %0, c9, c12, 3" :: "r"(0x8000000f));
+  return 0;
+}
+
+long long ticks_setup(void)
+{
+  return (cpucycles_works(enable) && cpucycles_works(ticks)) ? cpucycles_EXTEND32 : cpucycles_SKIP;
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/arm64-pmc.c b/cpu-cycles/libcpucycles/cpucycles/arm64-pmc.c
new file mode 100644
index 0000000000..d38f4e293f
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/arm64-pmc.c
@@ -0,0 +1,19 @@
+// version 20230105
+// public domain
+// djb
+// adapted from supercop/cpucycles/armv8.c
+
+#include "cpucycles_internal.h"
+
+long long ticks(void)
+{
+  long long result;
+  asm volatile("mrs %0, PMCCNTR_EL0" : "=r" (result)); // read system register PMCCNTR_EL0
+  return result;
+}
+
+// Report a cycle counter when cpucycles_works() accepts ticks(); otherwise skip this option.
+long long ticks_setup(void)
+{
+  return cpucycles_works(ticks) ? cpucycles_CYCLECOUNTER : cpucycles_SKIP;
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/arm64-vct.c b/cpu-cycles/libcpucycles/cpucycles/arm64-vct.c
new file mode 100644
index 0000000000..a04b59e108
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/arm64-vct.c
@@ -0,0 +1,19 @@
+// version 20230105
+// public domain
+// djb
+// adapted from supercop/cpucycles/vct.c
+
+#include "cpucycles_internal.h"
+
+long long ticks(void)
+{
+  long long result;
+  asm volatile("mrs %0, CNTVCT_EL0" : "=r" (result)); // read system register CNTVCT_EL0
+  return result;
+}
+
+// Skip when cpucycles_works() rejects ticks(); otherwise return cpucycles_FINDMULTIPLIER.
+long long ticks_setup(void)
+{
+  return cpucycles_works(ticks) ? cpucycles_FINDMULTIPLIER : cpucycles_SKIP;
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/cpucycles.h b/cpu-cycles/libcpucycles/cpucycles/cpucycles.h
new file mode 100644
index 0000000000..b6dfa6ee94
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/cpucycles.h
@@ -0,0 +1,25 @@
+// version 20230115
+// public domain
+// djb
+
+// 20230115 djb: cpucycles_version()
+// 20230114 djb: improve punctuation
+
+#ifndef cpucycles_h
+#define cpucycles_h
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern long long (*cpucycles)(void) __attribute__((visibility("default")));
+extern const char *cpucycles_implementation(void) __attribute__((visibility("default")));
+extern const char *cpucycles_version(void) __attribute__((visibility("default")));
+extern long long cpucycles_persecond(void) __attribute__((visibility("default")));
+extern void cpucycles_tracesetup(void) __attribute__((visibility("default")));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/cpu-cycles/libcpucycles/cpucycles/cpucycles_internal.h b/cpu-cycles/libcpucycles/cpucycles/cpucycles_internal.h
new file mode 100644
index 0000000000..61aa3dd836
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/cpucycles_internal.h
@@ -0,0 +1,20 @@
+// version 20230105
+// public domain
+// djb
+
+#ifndef cpucycles_internal_h
+#define cpucycles_internal_h
+
+extern long long cpucycles_init(void);
+extern long long cpucycles_microseconds(void);
+extern int cpucycles_works(long long (*)(void));
+
+// return values from ticks_setup():
+#define cpucycles_SKIP (0)
+#define cpucycles_CYCLECOUNTER (-1)
+#define cpucycles_MAYBECYCLECOUNTER (-2)
+#define cpucycles_FINDMULTIPLIER (-3)
+#define cpucycles_EXTEND32 (-32)
+// and positive values mean known ticks/second
+
+#endif
diff --git a/cpu-cycles/libcpucycles/cpucycles/default-gettimeofday.c b/cpu-cycles/libcpucycles/cpucycles/default-gettimeofday.c
new file mode 100644
index 0000000000..f26ff4c37e
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/default-gettimeofday.c
@@ -0,0 +1,15 @@
+// version 20230105
+// public domain
+// djb
+
+#include "cpucycles_internal.h"
+
+long long ticks_setup(void)
+{
+  return 1000000; // positive result = known ticks per second (see cpucycles_internal.h)
+}
+
+long long ticks(void)
+{
+  return cpucycles_microseconds(); // declared in cpucycles_internal.h; defined outside this file
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/default-mach.c b/cpu-cycles/libcpucycles/cpucycles/default-mach.c
new file mode 100644
index 0000000000..e32f6f1cad
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/default-mach.c
@@ -0,0 +1,17 @@
+// version 20230105
+// public domain
+// djb
+
+#include <mach/mach_time.h>
+#include "cpucycles_internal.h"
+
+long long ticks(void)
+{
+  return mach_absolute_time(); // from <mach/mach_time.h>, converted to long long
+}
+
+// cpucycles_FINDMULTIPLIER when cpucycles_works() accepts ticks(); cpucycles_SKIP otherwise.
+long long ticks_setup(void)
+{
+  return cpucycles_works(ticks) ? cpucycles_FINDMULTIPLIER : cpucycles_SKIP;
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/default-monotonic.c b/cpu-cycles/libcpucycles/cpucycles/default-monotonic.c
new file mode 100644
index 0000000000..f609811b2b
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/default-monotonic.c
@@ -0,0 +1,23 @@
+// version 20230105
+// public domain
+// djb
+// adapted from supercop/cpucycles/monotonic.c
+
+#include <time.h>
+#include <sys/time.h>
+
+long long ticks_setup(void)
+{
+  return 1000000000; // ticks per second: ticks() below counts nanoseconds
+}
+
+// Current CLOCK_MONOTONIC reading as a single nanosecond count
+// (seconds scaled by 10^9, plus the nanosecond remainder).
+long long ticks(void)
+{
+  struct timespec now;
+  long long nanoseconds;
+  clock_gettime(CLOCK_MONOTONIC,&now);
+  nanoseconds = 1000000000LL * now.tv_sec + now.tv_nsec;
+  return nanoseconds;
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/default-perfevent.c b/cpu-cycles/libcpucycles/cpucycles/default-perfevent.c
new file mode 100644
index 0000000000..615c156866
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/default-perfevent.c
@@ -0,0 +1,101 @@
+// version 20230106
+// public domain
+// djb
+// adapted from supercop/cpucycles/perfevent.c
+
+// 20230106 djb: read() into int64_t instead of long long
+// 20230106 djb: add comment on RUNNING/ENABLED
+
+/*
+This code intentionally avoids dividing by the
+PERF_FORMAT_TOTAL_TIME_RUNNING/ENABLED ratio.
+
+The motivation for that ratio is as follows:
+
+* A typical CPU has a limited number of performance-monitoring
+  counters active at once. For example, there are 8 "programmable"
+  counters on Intel Skylake.
+
+* "perf stat" allows the user to enable more counters. The OS kernel
+  periodically (e.g., every millisecond) changes the limited number of
+  active hardware counters to a new subset of the enabled counters, and
+  "perf stat" reports PERF_FORMAT_TOTAL_TIME_RUNNING/ENABLED for each
+  counter, the fraction of time spent with that counter running.
+
+For long-running programs, dividing the hardware counter by
+RUNNING/ENABLED usually produces a reasonable estimate of what the count
+would have been without competition from other counters.
+
+A fixable problem with this multiplexing of counters is that the kernel
+appears to simply cycle through counters, so unlucky programs can
+trigger moiré effects. The fix is to select random subsets of counters.
+
+A more fundamental problem is that cpucycles() has to be usable for
+timing short subroutines, including subroutines so short that the OS has
+no opportunity to change from one selection of counters to another. Say
+RUNNING is 0; should cpucycles() then divide by 0?
+
+If a caller runs cpucycles(), X(), cpucycles(), X(), etc., and the cycle
+counter happens to be enabled for only 80% of the runs of X(), then
+simply computing the median difference of adjacent cycle counts, with no
+scaling, will filter out the zeros and correctly compute the cost of X.
+Averages won't (without scaling), but averages have other problems, such
+as being heavily influenced by interrupts. (Omitting kernel time from
+perf results does not remove the influence of interrupts on caches.)
+
+Given the importance of cycle counting, it is better to have cycle
+counters always running. For example, on Skylake, Intel provides the 8
+"programmable" counters on top of a separate cycle counter ("fixed
+counter 1"), so there is no good reason for the kernel to waste a
+"programmable" counter on a cycle counter, there is no good reason to
+turn the cycle counter off, and there is no good reason for RUNNING to
+be below ENABLED for the cycle counter.
+
+Of course, applications that use just one performance counter at a time
+don't have to worry about kernels getting this wrong, and don't have to
+worry about the possibility of getting noisy or invalid results on CPUs
+that have heavier constraints on the number of simultaneous counters.
+*/
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <linux/perf_event.h>
+#include "cpucycles_internal.h"
+
+static int fddev = -1; // perf-event descriptor read by ticks(); stays -1 until ticks_setup() opens it
+
+long long ticks(void) // reads the 64-bit counter value from the perf-event descriptor; returns 0 if the read fails
+{
+  int64_t result;
+
+  if (read(fddev,&result,sizeof result) != (ssize_t) sizeof result) return 0; // signed compare: a -1 or short read() must not fall through to an uninitialized result
+  return result;
+}
+
+long long ticks_setup(void) // opens and enables the CPU-cycles perf event once; cpucycles_SKIP if it cannot be opened
+{
+  if (fddev == -1) { // nothing opened yet (first call, or an earlier open failed)
+    static struct perf_event_attr attr;
+
+    memset(&attr,0,sizeof attr);
+    attr.type = PERF_TYPE_HARDWARE;
+    attr.size = sizeof(struct perf_event_attr);
+    attr.config = PERF_COUNT_HW_CPU_CYCLES;
+    attr.disabled = 1; // created disabled; switched on by the ENABLE ioctl below
+    attr.exclude_kernel = 1;
+    attr.exclude_hv = 1;
+
+    fddev = syscall(__NR_perf_event_open,&attr,0,-1,-1,0);
+    if (fddev == -1) return cpucycles_SKIP;
+
+    ioctl(fddev,PERF_EVENT_IOC_RESET,0); // NOTE(review): ioctl results are not checked
+    ioctl(fddev,PERF_EVENT_IOC_ENABLE,0);
+  }
+
+  return cpucycles_MAYBECYCLECOUNTER;
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/default-zero.c b/cpu-cycles/libcpucycles/cpucycles/default-zero.c
new file mode 100644
index 0000000000..98b49fb535
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/default-zero.c
@@ -0,0 +1,15 @@
+// version 20230105
+// public domain
+// djb
+
+#include "cpucycles_internal.h"
+
+long long ticks_setup(void) // always cpucycles_SKIP, so cpucycles_init() never measures this option
+{
+  return cpucycles_SKIP;
+}
+
+long long ticks(void) // placeholder counter: constant 0
+{
+  return 0;
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/mips64-cc.c b/cpu-cycles/libcpucycles/cpucycles/mips64-cc.c
new file mode 100644
index 0000000000..7ab5221f61
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/mips64-cc.c
@@ -0,0 +1,33 @@
+// version 20230105
+// public domain
+// djb
+// adapted from supercop/cpucycles/mips.c
+
+// mips32 release 2 instruction rdhwr
+// 7c02103b: read hwr#2 (cycle count) into $2
+// 7c02183b: read hwr#3 (cycle-count multiplier) into $2
+
+#include "cpucycles_internal.h"
+
+static unsigned int multiplier = 0; // written by multiplier_set(); applied to every reading in ticks()
+
+static long long multiplier_set(void) // stores the 0x7c02183b read (hwr#3 per the opcode note above) in `multiplier` and returns it
+{
+  asm volatile(".long 0x7c02183b; move %0,$2" : "=r"(multiplier) : : "$2"); // raw opcode writes $2, which is declared clobbered
+  return multiplier;
+}
+
+long long ticks(void) // 0x7c02103b read (hwr#2 per the opcode note above) times the cached multiplier
+{
+  unsigned int result;
+  asm volatile(".long 0x7c02103b; move %0,$2" : "=r"(result) :: "$2");
+  result *= multiplier; // multiplication is done in unsigned int, before the widening cast
+  return (unsigned long long) result;
+}
+
+long long ticks_setup(void) // probes both hardware-register reads; cpucycles_SKIP if either fails, else cpucycles_EXTEND32
+{
+  if (!cpucycles_works(multiplier_set)) return cpucycles_SKIP; // a successful probe also fills in `multiplier`
+  if (!cpucycles_works(ticks)) return cpucycles_SKIP;
+  return cpucycles_EXTEND32; // cpucycles_init() marks such options only32 and routes them through cpucycles_extend32()
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/options b/cpu-cycles/libcpucycles/cpucycles/options
new file mode 100644
index 0000000000..f29fed57c9
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/options
@@ -0,0 +1,19 @@
+amd64-pmc
+amd64-tsc
+amd64-tscasm
+arm32-cortex
+arm64-pmc
+arm64-vct
+mips64-cc
+ppc32-mftb
+ppc64-mftb
+riscv32-rdcycle
+riscv64-rdcycle
+s390x-stckf
+sparc64-rdtick
+x86-tsc
+x86-tscasm
+default-perfevent
+default-mach
+default-monotonic
+default-gettimeofday
diff --git a/cpu-cycles/libcpucycles/cpucycles/ppc32-mftb.c b/cpu-cycles/libcpucycles/cpucycles/ppc32-mftb.c
new file mode 100644
index 0000000000..93e1deae04
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/ppc32-mftb.c
@@ -0,0 +1,30 @@
+// version 20230105
+// public domain
+// djb
+// adapted from supercop/cpucycles/powerpccpuinfo.c
+
+#include "cpucycles_internal.h"
+
+long long ticks(void) // builds a 64-bit value from the mftbu (upper) and mftb (lower) reads
+{
+  unsigned int high, low, newhigh;
+  unsigned long long result;
+
+  do { // repeat until the upper word read before and after the lower word agree
+    asm volatile(
+      "mftbu %0; mftb %1; mftbu %2"
+      : "=r" (high), "=r" (low), "=r" (newhigh)
+    );
+  } while (newhigh != high);
+
+  result = high;
+  result <<= 32; // upper word into bits 63..32
+  result |= low;
+  return result;
+}
+
+long long ticks_setup(void) // probes ticks(); cpucycles_SKIP if the probe fails, else cpucycles_FINDMULTIPLIER
+{
+  if (!cpucycles_works(ticks)) return cpucycles_SKIP;
+  return cpucycles_FINDMULTIPLIER; // cpucycles_init() then estimates the cycles-per-tick scaling itself
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/ppc64-mftb.c b/cpu-cycles/libcpucycles/cpucycles/ppc64-mftb.c
new file mode 100644
index 0000000000..93e1deae04
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/ppc64-mftb.c
@@ -0,0 +1,30 @@
+// version 20230105
+// public domain
+// djb
+// adapted from supercop/cpucycles/powerpccpuinfo.c
+
+#include "cpucycles_internal.h"
+
+long long ticks(void) // builds a 64-bit value from the mftbu (upper) and mftb (lower) reads; same code as ppc32-mftb.c
+{
+  unsigned int high, low, newhigh;
+  unsigned long long result;
+
+  do { // repeat until the upper word read before and after the lower word agree
+    asm volatile(
+      "mftbu %0; mftb %1; mftbu %2"
+      : "=r" (high), "=r" (low), "=r" (newhigh)
+    );
+  } while (newhigh != high);
+
+  result = high;
+  result <<= 32; // upper word into bits 63..32
+  result |= low;
+  return result;
+}
+
+long long ticks_setup(void) // probes ticks(); cpucycles_SKIP if the probe fails, else cpucycles_FINDMULTIPLIER
+{
+  if (!cpucycles_works(ticks)) return cpucycles_SKIP;
+  return cpucycles_FINDMULTIPLIER; // cpucycles_init() then estimates the cycles-per-tick scaling itself
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/riscv32-rdcycle.c b/cpu-cycles/libcpucycles/cpucycles/riscv32-rdcycle.c
new file mode 100644
index 0000000000..73e581cbba
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/riscv32-rdcycle.c
@@ -0,0 +1,39 @@
+// version 20230105
+// public domain
+// djb
+
+// adapted from supercop/cpucycles/riscv.c
+// which has code from djb and Romain Dolbeau
+
+#include "cpucycles_internal.h"
+
+#ifndef __riscv_xlen
+#error this code is only for riscv platforms
+#endif
+
+#if __riscv_xlen != 32
+#error this code is only for riscv32 platforms
+#endif
+
+long long ticks(void) // builds a 64-bit value from the rdcycleh (upper) and rdcycle (lower) reads
+{
+  unsigned int low, high, newhigh;
+  unsigned long long result;
+
+  asm volatile( "start%=:\n" // %= makes the label unique per asm instance
+                "rdcycleh %0\n"
+                "rdcycle %1\n"
+                "rdcycleh %2\n"
+                "bne %0, %2, start%=\n" // upper word changed between the two reads: start over
+                : "=r"(high), "=r"(low), "=r"(newhigh));
+  result = high;
+  result <<= 32; // upper word into bits 63..32
+  result |= low;
+  return result;
+}
+
+long long ticks_setup(void) // probes ticks(); cpucycles_SKIP if the probe fails, else cpucycles_CYCLECOUNTER
+{
+  if (!cpucycles_works(ticks)) return cpucycles_SKIP;
+  return cpucycles_CYCLECOUNTER; // cpucycles_init() uses scaling 1.0 for this value
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/riscv64-rdcycle.c b/cpu-cycles/libcpucycles/cpucycles/riscv64-rdcycle.c
new file mode 100644
index 0000000000..5c8b21bb22
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/riscv64-rdcycle.c
@@ -0,0 +1,29 @@
+// version 20230105
+// public domain
+// djb
+
+// adapted from supercop/cpucycles/riscv.c
+// which has code from djb and Romain Dolbeau
+
+#include "cpucycles_internal.h"
+
+#ifndef __riscv_xlen
+#error this code is only for riscv platforms
+#endif
+
+#if __riscv_xlen != 64
+#error this code is only for riscv64 platforms
+#endif
+
+long long ticks(void) // single rdcycle read, returned as-is
+{
+  long long result;
+  asm volatile("rdcycle %0" : "=r" (result));
+  return result;
+}
+
+long long ticks_setup(void) // probes ticks(); cpucycles_SKIP if the probe fails, else cpucycles_CYCLECOUNTER
+{
+  if (!cpucycles_works(ticks)) return cpucycles_SKIP;
+  return cpucycles_CYCLECOUNTER; // cpucycles_init() uses scaling 1.0 for this value
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/s390x-stckf.c b/cpu-cycles/libcpucycles/cpucycles/s390x-stckf.c
new file mode 100644
index 0000000000..0688403fe9
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/s390x-stckf.c
@@ -0,0 +1,20 @@
+// version 20230106
+// public domain
+// djb
+
+// adapted from sparc64-rdtick.c
+
+#include "cpucycles_internal.h"
+
+long long ticks(void) // stckf writes its value into `result` through the address operand
+{
+  long long result;
+  asm volatile("stckf 0(%0)" :: "a"(&result) : "memory","cc"); // no output operand: the "memory" clobber covers the store to result
+  return result;
+}
+
+long long ticks_setup(void) // probes ticks(); cpucycles_SKIP if the probe fails, else a fixed ticks-per-second rate
+{
+  if (!cpucycles_works(ticks)) return cpucycles_SKIP;
+  return 4096000000; // manual says 2^12 per microsecond
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/sparc64-rdtick.c b/cpu-cycles/libcpucycles/cpucycles/sparc64-rdtick.c
new file mode 100644
index 0000000000..e0f1e1dce7
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/sparc64-rdtick.c
@@ -0,0 +1,24 @@
+// version 20230105
+// public domain
+// djb
+
+// adapted from supercop/cpucycles/sparccpuinfo.c
+
+#include "cpucycles_internal.h"
+
+#if defined(__sparcv8) || defined(__sparcv8plus)
+#error this code is only for sparc64 platforms
+#endif
+
+long long ticks(void) // single read of the %tick register, returned as-is
+{
+  long long result;
+  asm volatile("rd %%tick,%0" : "=r" (result));
+  return result;
+}
+
+long long ticks_setup(void) // probes ticks(); cpucycles_SKIP if the probe fails, else cpucycles_CYCLECOUNTER
+{
+  if (!cpucycles_works(ticks)) return cpucycles_SKIP;
+  return cpucycles_CYCLECOUNTER; // cpucycles_init() uses scaling 1.0 for this value
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/wrapper.c b/cpu-cycles/libcpucycles/cpucycles/wrapper.c
new file mode 100644
index 0000000000..6f284d93ea
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/wrapper.c
@@ -0,0 +1,420 @@
+// version 20230115
+// public domain
+// djb
+// includes some pieces adapted from supercop
+
+// 20230115 djb: cpucycles_version()
+// 20230106 djb: support "cpu MHz static" (ibm z15)
+
+#include <time.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <setjmp.h>
+#include "cpucycles.h"
+#include "cpucycles_internal.h"
+
+static int tracesetup = 0; // when nonzero, cpucycles_init() prints one line per candidate counter
+
+void cpucycles_tracesetup(void) // switches that tracing on; the flag is only consulted inside cpucycles_init()
+{
+  tracesetup = 1;
+}
+
+static sigjmp_buf crash_jmp; // sigsetjmp()/siglongjmp() take a sigjmp_buf, which need not have the same size as jmp_buf
+
+static void crash(int s) // signal handler installed by cpucycles_works(); the signal number s is not used
+{
+  siglongjmp(crash_jmp,0); // unwinds to the sigsetjmp() in cpucycles_works()
+}
+
+int cpucycles_works(long long (*ticks)(void)) // 1 if one call to ticks() completes; 0 if it raises SIGILL, SIGFPE, SIGBUS or SIGSEGV
+{
+  volatile int result = 0; // stays 0 when crash() jumps back before the assignment below
+  struct sigaction old_sigill;
+  struct sigaction old_sigfpe;
+  struct sigaction old_sigbus;
+  struct sigaction old_sigsegv;
+  struct sigaction crash_action;
+
+  memset(&crash_action,0,sizeof crash_action);
+  crash_action.sa_handler = crash;
+
+  sigaction(SIGILL,0,&old_sigill); // query-only calls: remember the current dispositions
+  sigaction(SIGFPE,0,&old_sigfpe);
+  sigaction(SIGBUS,0,&old_sigbus);
+  sigaction(SIGSEGV,0,&old_sigsegv);
+
+  if (!sigsetjmp(crash_jmp,1)) { // direct return is 0; the return via siglongjmp() is nonzero and skips this body
+    sigaction(SIGILL,&crash_action,0);
+    sigaction(SIGFPE,&crash_action,0);
+    sigaction(SIGBUS,&crash_action,0);
+    sigaction(SIGSEGV,&crash_action,0);
+    ticks(); // the probe itself; its return value is ignored
+    result = 1;
+  }
+
+  sigaction(SIGILL,&old_sigill,0); // reached on both paths: put the remembered dispositions back
+  sigaction(SIGFPE,&old_sigfpe,0);
+  sigaction(SIGBUS,&old_sigbus,0);
+  sigaction(SIGSEGV,&old_sigsegv,0);
+
+  return result;
+}
+
+static double osfreq(void) // tries a fixed sequence of sources for a CPU frequency figure; first successful parse wins
+{
+  FILE *f;
+  char *x;
+  double result;
+  int s; // fscanf/sscanf return value: >0 means a number was parsed
+
+  f = fopen("/etc/cpucyclespersecond", "r"); // source 1: value returned unscaled
+  if (f) {
+    s = fscanf(f,"%lf",&result);
+    fclose(f);
+    if (s > 0) return result;
+  }
+
+  f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed", "r"); // source 2: value multiplied by 1000
+  if (f) {
+    s = fscanf(f,"%lf",&result);
+    fclose(f);
+    if (s > 0) return 1000.0 * result;
+  }
+
+  f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq", "r"); // source 3: value multiplied by 1000
+  if (f) {
+    s = fscanf(f,"%lf",&result);
+    fclose(f);
+    if (s > 0) return 1000.0 * result;
+  }
+
+  f = fopen("/sys/devices/system/cpu/cpu0/clock_tick", "r"); // source 4: value returned unscaled
+  if (f) {
+    s = fscanf(f,"%lf",&result);
+    fclose(f);
+    if (s > 0) return result;
+  }
+
+  f = fopen("/proc/cpuinfo","r"); // source 5: first "cpu MHz : <n>" entry, multiplied by 1000000
+  if (f) {
+    for (;;) {
+      s = fscanf(f,"cpu MHz : %lf",&result);
+      if (s > 0) break;
+      if (s == 0) s = fscanf(f,"%*[^\n]\n"); // no match here: discard the rest of the line and retry
+      if (s < 0) { result = 0; break; } // end of input: result 0 makes the check below fall through
+    }
+    fclose(f);
+    if (result) return 1000000.0 * result;
+  }
+
+  f = fopen("/proc/cpuinfo","r"); // source 6: same scan for "clock : <n>", multiplied by 1000000
+  if (f) {
+    for (;;) {
+      s = fscanf(f,"clock : %lf",&result);
+      if (s > 0) break;
+      if (s == 0) s = fscanf(f,"%*[^\n]\n");
+      if (s < 0) { result = 0; break; }
+    }
+    fclose(f);
+    if (result) return 1000000.0 * result;
+  }
+
+  f = fopen("/proc/cpuinfo","r"); // source 7: same scan for "cpu MHz static : <n>", multiplied by 1000000
+  if (f) {
+    for (;;) {
+      s = fscanf(f,"cpu MHz static : %lf",&result);
+      if (s > 0) break;
+      if (s == 0) s = fscanf(f,"%*[^\n]\n");
+      if (s < 0) { result = 0; break; }
+    }
+    fclose(f);
+    if (result) return 1000000.0 * result;
+  }
+
+  f = popen("sysctl hw.cpufrequency 2>/dev/null","r"); // source 8: command output, accepted only if positive
+  if (f) {
+    s = fscanf(f,"hw.cpufrequency: %lf",&result);
+    pclose(f);
+    if (s > 0) if (result > 0) return result;
+  }
+
+  f = popen("/usr/sbin/lsattr -E -l proc0 -a frequency 2>/dev/null","r"); // source 9: command output, returned unscaled
+  if (f) {
+    s = fscanf(f,"frequency %lf",&result);
+    pclose(f);
+    if (s > 0) return result;
+  }
+
+  f = popen("/usr/sbin/psrinfo -v 2>/dev/null","r"); // source 10: line scan of command output, multiplied by 1000000
+  if (f) {
+    for (;;) {
+      s = fscanf(f," The %*s processor operates at %lf MHz",&result);
+      if (s > 0) break;
+      if (s == 0) s = fscanf(f,"%*[^\n]\n");
+      if (s < 0) { result = 0; break; }
+    }
+    pclose(f);
+    if (result) return 1000000.0 * result;
+  }
+
+  x = getenv("cpucyclespersecond"); // source 11: environment variable, returned unscaled
+  if (x) {
+    s = sscanf(x,"%lf",&result);
+    if (s > 0) return result;
+  }
+
+  return 2399987654.0; // fallback when every source above failed
+}
+
+static long long persecond = 0; // frequency estimate; assigned from osfreq() in cpucycles_init()
+static const char *implementation = "none"; // name of the selected option; assigned in cpucycles_init()
+
+long long (*cpucycles)(void) = cpucycles_init; // starts at cpucycles_init, which repoints it at the chosen counter
+
+const char *cpucycles_implementation(void) // name of the counter in use
+{
+  cpucycles(); // if selection has not run yet, this call is cpucycles_init() and sets `implementation`
+  return implementation;
+}
+
+long long cpucycles_persecond(void) // frequency estimate stored in `persecond`
+{
+  cpucycles(); // if selection has not run yet, this call is cpucycles_init() and sets `persecond`
+  return persecond;
+}
+
+const char *cpucycles_version(void) // library version as a fixed string literal
+{
+  return "20230115";
+}
+
+// ----- cycle counter scaled from ticks
+
+static double cpucycles_scaled_scaling = 0; // multiplier applied to tick differences; set in cpucycles_init()
+static long long cpucycles_scaled_offset = 0; // tick reading taken when this mode was selected
+static long long (*cpucycles_scaled_from)(void) = 0; // underlying tick source
+
+static long long cpucycles_scaled(void) // ticks elapsed since the stored offset, multiplied by the scaling factor
+{
+  return (cpucycles_scaled_from()-cpucycles_scaled_offset)*cpucycles_scaled_scaling; // double product converted to long long on return
+}
+
+// ----- cycle counter extended from 32-bit ticks
+
+static long long (*cpucycles_extend32_from)(void) = 0; // underlying tick source; only its low 32 bits are kept
+
+static uint32_t cpucycles_extend32_prev_ticks; // tick reading at the last reference update
+static long long cpucycles_extend32_prev_us; // cpucycles_microseconds() at the last reference update
+static long long cpucycles_extend32_prev_cycles; // running total at the last reference update
+
+static void cpucycles_extend32_setup(void) // records the initial reference point; the running total starts at 0
+{
+  long long (*ticks)(void) = cpucycles_extend32_from;
+  cpucycles_extend32_prev_ticks = ticks();
+  cpucycles_extend32_prev_us = cpucycles_microseconds();
+  cpucycles_extend32_prev_cycles = 0;
+}
+
+static long long cpucycles_extend32(void) // running long long total built from a tick source truncated to uint32_t
+{
+  long long (*ticks)(void) = cpucycles_extend32_from;
+
+  uint32_t new_ticks = ticks();
+  unsigned long long delta_ticks = new_ticks-cpucycles_extend32_prev_ticks; // difference of two uint32_t readings, then widened
+  long long new_us = cpucycles_microseconds();
+  long long delta_us = new_us-cpucycles_extend32_prev_us;
+
+  // assume that number of cycles cannot increase by 2^32 in 2ms
+
+  if (delta_us < 1000) // under 1000us since the reference: answer without moving the reference
+    return cpucycles_extend32_prev_cycles+delta_ticks;
+
+  cpucycles_extend32_prev_ticks = new_ticks;
+  cpucycles_extend32_prev_us = new_us;
+
+  if (delta_us >= 2000) { // 2000us or more: add multiples of 2^32 until within 2^31 of the time-based target
+    long long target = (delta_us*0.000001)*persecond;
+    while (delta_ticks+2147483648ULL < target)
+      delta_ticks += 4294967296ULL;
+  }
+
+  return cpucycles_extend32_prev_cycles += delta_ticks; // updates the stored total and returns it
+}
+
+// ----- estimating cycles per tick
+
+long long cpucycles_microseconds(void) // gettimeofday() reading flattened into a single microsecond count
+{
+  struct timeval t;
+  long long result;
+  gettimeofday(&t,(struct timezone *) 0); // NOTE(review): return value is not checked
+  result = t.tv_sec;
+  result *= 1000000; // seconds -> microseconds
+  result += t.tv_usec;
+  return result;
+}
+
+static double estimate_cyclespertick(long long (*ticks)(void)) // ratio of expected cycles (persecond x elapsed time) to elapsed ticks; 0 if t1 <= t0
+{
+  long long t0,t1,us0,us1;
+
+  t0 = ticks();
+  us0 = cpucycles_microseconds();
+  do {
+    t1 = ticks();
+    us1 = cpucycles_microseconds();
+  } while (us1-us0 < 10000 || t1-t0 < 1000); // spin until at least 10000us and at least 1000 ticks have elapsed
+  if (t1 <= t0) return 0;
+  t1 -= t0; // from here on t1 and us1 hold elapsed amounts
+  us1 -= us0;
+  return (persecond * 0.000001 * (double) us1) / (double) t1;
+}
+
+// ----- selecting an option
+
+#include "options.inc"
+
+#define CALLS 1000 // consecutive readings taken per measurement attempt (t[] holds CALLS+1)
+#define ESTIMATES 3 // cycles-per-tick estimates that must agree for a FINDMULTIPLIER option
+
+long long cpucycles_init(void) // measures every option, installs the best one in `cpucycles`, and returns its first reading
+{
+  long long precision[NUMOPTIONS]; // per-option score; 0 = unusable, otherwise lower is better
+  double scaling[NUMOPTIONS]; // per-option factor from ticks to cycles
+  int only32[NUMOPTIONS]; // 1 for options that returned cpucycles_EXTEND32
+  long long bestprecision;
+  long long bestopt;
+  long long opt;
+
+  persecond = osfreq();
+
+  for (opt = 0;opt < NUMOPTIONS;++opt) {
+    long long freq = options[opt].ticks_setup();
+    long long tries;
+
+    precision[opt] = 0;
+    scaling[opt] = 0;
+    only32[opt] = 0;
+
+    if (freq > 0) { // known ticks per second: scale by persecond/freq
+      scaling[opt] = persecond*1.0/freq;
+    } else if (freq == cpucycles_CYCLECOUNTER) {
+      scaling[opt] = 1.0;
+    } else if (freq == cpucycles_EXTEND32) {
+      only32[opt] = 1;
+      scaling[opt] = 1.0;
+    } else if (freq == cpucycles_MAYBECYCLECOUNTER) {
+      scaling[opt] = 1.0;
+    } else if (freq == cpucycles_FINDMULTIPLIER) {
+      int ok = 0;
+      double denom;
+      long long loop;
+
+      for (denom = 1;denom <= 1024;denom += denom) { // denom doubles: 1, 2, 4, ..., 1024
+        double est[ESTIMATES];
+        for (loop = 0;loop < ESTIMATES;++loop)
+          est[loop] = denom*estimate_cyclespertick(options[opt].ticks);
+        scaling[opt] = (double) (long long) est[0]; // truncate, then adjust to the nearest integer below
+        if (scaling[opt] < est[0]-0.5) scaling[opt] += 1;
+        if (scaling[opt] > est[0]+0.5) scaling[opt] -= 1;
+        ok = 1;
+        for (loop = 0;loop < ESTIMATES;++loop) { // every estimate must be within 0.1 of that integer
+          if (est[loop]-scaling[opt] > 0.1) ok = 0;
+          if (scaling[opt]-est[loop] > 0.1) ok = 0;
+        }
+        if (ok) {
+          scaling[opt] /= denom; // accepted: scaling is integer/denom
+          break;
+        }
+        scaling[opt] = 0;
+      }
+      if (!ok) continue; // no denom gave consistent estimates: option stays at precision 0
+    } else {
+      continue; // any other return value (e.g. cpucycles_SKIP): option stays at precision 0
+    }
+
+    for (tries = 0;tries < 10;++tries) {
+      long long t[CALLS+1];
+      long long ok = 1;
+      long long i;
+
+      if (scaling[opt] == 1.0) {
+        for (i = 0;i <= CALLS;++i)
+          t[i] = options[opt].ticks();
+      } else {
+        double scalingopt = scaling[opt];
+        long long offset = options[opt].ticks();
+        for (i = 0;i <= CALLS;++i)
+          t[i] = (options[opt].ticks()-offset)*scalingopt;
+      }
+      for (i = 0;i < CALLS;++i)
+        if (t[i] > t[i+1]) // a reading went backwards: reject this attempt
+          ok = 0;
+      if (t[0] == t[CALLS]) // no change across the whole run: reject this attempt
+        ok = 0;
+
+      if (ok) {
+        long long smallestdiff = 0;
+        for (i = 0;i < CALLS;++i) {
+          long long diff = t[i+1]-t[i];
+          if (diff <= 0) continue;
+          if (smallestdiff == 0 || diff < smallestdiff)
+            smallestdiff = diff;
+        }
+        precision[opt] = smallestdiff; // smallest positive step between adjacent readings
+
+        // tilt selection towards more robust counters
+        if (freq != cpucycles_CYCLECOUNTER && freq != cpucycles_EXTEND32)
+          precision[opt] += 100;
+        if (freq > 0)
+          precision[opt] += 100;
+
+        break;
+      }
+
+      // otherwise keep trying
+      // since !ok can be caused by overflow
+      // or by core swap
+    }
+  }
+
+  if (tracesetup) {
+    for (opt = 0;opt < NUMOPTIONS;++opt)
+      printf("cpucycles tracesetup %lld %s precision %lld scaling %lf only32 %d\n"
+        ,opt,options[opt].implementation,precision[opt],scaling[opt],only32[opt]);
+  }
+
+  bestopt = DEFAULTOPTION; // used when no option ends up with a nonzero precision
+  bestprecision = 0;
+  for (opt = 0;opt < NUMOPTIONS;++opt)
+    if (precision[opt] > 0)
+      if (!bestprecision || precision[opt] < bestprecision) { // strict <: the earliest option wins ties
+        bestopt = opt;
+        bestprecision = precision[opt];
+      }
+
+  implementation = options[bestopt].implementation;
+  
+  if (scaling[bestopt] == 1.0) {
+    if (only32[bestopt]) {
+      cpucycles_extend32_from = options[bestopt].ticks;
+      cpucycles_extend32_setup();
+      cpucycles = cpucycles_extend32;
+    } else {
+      cpucycles = options[bestopt].ticks; // call the option's ticks() directly, with no wrapper
+    }
+  } else {
+    cpucycles_scaled_scaling = scaling[bestopt];
+    cpucycles_scaled_from = options[bestopt].ticks;
+    cpucycles_scaled_offset = cpucycles_scaled_from();
+    cpucycles = cpucycles_scaled;
+  }
+
+  return cpucycles(); // cpucycles no longer points here, so this is a reading from the chosen counter
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/x86-tsc.c b/cpu-cycles/libcpucycles/cpucycles/x86-tsc.c
new file mode 100644
index 0000000000..921bbec494
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/x86-tsc.c
@@ -0,0 +1,22 @@
+// version 20230105
+// public domain
+// djb
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+#include "cpucycles_internal.h"
+
+long long ticks(void) // value of the compiler's __rdtsc() intrinsic, returned as-is
+{
+  return __rdtsc();
+}
+
+long long ticks_setup(void) // probes ticks(); cpucycles_SKIP if the probe fails, else cpucycles_MAYBECYCLECOUNTER
+{
+  if (!cpucycles_works(ticks)) return cpucycles_SKIP;
+  return cpucycles_MAYBECYCLECOUNTER; // cpucycles_init() uses scaling 1.0 but adds 100 to this option's precision score
+}
diff --git a/cpu-cycles/libcpucycles/cpucycles/x86-tscasm.c b/cpu-cycles/libcpucycles/cpucycles/x86-tscasm.c
new file mode 100644
index 0000000000..f975e7ea39
--- /dev/null
+++ b/cpu-cycles/libcpucycles/cpucycles/x86-tscasm.c
@@ -0,0 +1,22 @@
+// version 20230105
+// public domain
+// djb
+
+#include "cpucycles_internal.h"
+
+#ifndef __i386__
+#error this code is only for 32-bit x86 platforms
+#endif
+
+long long ticks(void) // emits opcode bytes 15,49 and returns the 64-bit value bound to the "A" constraint
+{
+  long long result;
+  asm volatile(".byte 15;.byte 49" : "=A" (result)); // presumably the RDTSC encoding (0F 31), as the file name suggests; confirm against the ISA manual
+  return result;
+}
+
+long long ticks_setup(void) // probes ticks(); cpucycles_SKIP if the probe fails, else cpucycles_MAYBECYCLECOUNTER
+{
+  if (!cpucycles_works(ticks)) return cpucycles_SKIP;
+  return cpucycles_MAYBECYCLECOUNTER; // cpucycles_init() uses scaling 1.0 but adds 100 to this option's precision score
+}
diff --git a/cpu-cycles/libcpucycles/doc/api.md b/cpu-cycles/libcpucycles/doc/api.md
new file mode 100644
index 0000000000..32c3707dbe
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/api.md
@@ -0,0 +1,47 @@
+### NAME
+
+cpucycles - count CPU cycles
+
+### SYNOPSIS
+
+    #include <cpucycles.h>
+
+    long long count = cpucycles();
+    long long persecond = cpucycles_persecond();
+    const char *implementation = cpucycles_implementation();
+    const char *version = cpucycles_version();
+
+Link with `-lcpucycles`. Old systems may also need `-lrt`.
+
+### DESCRIPTION
+
+`cpucycles()` returns an estimate for the number of CPU cycles that have
+occurred since an unspecified time in the past (perhaps system boot,
+perhaps program startup).
+
+Accessing true cycle counters can be difficult on some CPUs and
+operating systems. `cpucycles()` does its best to produce accurate
+results, but selects a low-precision counter if the only other option is
+failure.
+
+`cpucycles_persecond()` returns an estimate for the number of CPU cycles
+per second. This estimate comes from `/etc/cpucyclespersecond` if that
+file exists, otherwise from various OS mechanisms, otherwise from the
+`cpucyclespersecond` environment variable if that is set, otherwise
+2399987654.
+
+`cpucycles_implementation()` returns the name of the counter in use:
+e.g., `"amd64-pmc"`.
+
+`cpucycles_version()` returns the `libcpucycles` version number as a
+string: e.g., `"20230115"`. Results of `cpucycles_implementation()`
+should be interpreted relative to `cpucycles_version()`.
+
+`cpucycles` is actually a function pointer. The first call to
+`cpucycles()` or `cpucycles_persecond()` or `cpucycles_implementation()`
+selects one of the available counters and updates the `cpucycles`
+pointer accordingly. Subsequent calls to `cpucycles()` are thread-safe.
+
+### SEE ALSO
+
+**gettimeofday**(2), **clock_gettime**(2)
diff --git a/cpu-cycles/libcpucycles/doc/counters.md b/cpu-cycles/libcpucycles/doc/counters.md
new file mode 100644
index 0000000000..db87f89cfa
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/counters.md
@@ -0,0 +1,447 @@
+Currently libcpucycles supports the following cycle counters. Some
+cycle counters are actually other forms of counters that libcpucycles
+scales to imitate a cycle counter. There is
+[separate documentation](selection.html)
+on how libcpucycles chooses a cycle counter. See also
+[security considerations](security.html) regarding enabling or disabling
+counters and regarding Turbo Boost.
+
+`amd64-pmc`: Requires a 64-bit Intel/AMD platform. Requires the Linux
+perf_event interface. Accesses a cycle counter through RDPMC. Requires
+`/proc/sys/kernel/perf_event_paranoid` to be at most 2 for user-level
+RDPMC access. This counter runs at the clock frequency of the CPU core.
+
+`amd64-tsc`, `amd64-tscasm`: Requires a 64-bit Intel/AMD platform.
+Requires RDTSC to be enabled, which it is by default. Uses RDTSC to
+access the CPU's time-stamp counter. On current CPUs, this is an
+off-core clock rather than a cycle counter, but it is typically a very
+fast off-core clock, making it adequate for seeing cycle counts if
+overclocking and underclocking are disabled. The difference between
+`tsc` and `tscasm` is that `tsc` uses the compiler's `__rdtsc()` while
+`tscasm` uses inline assembly.
+
+`arm32-cortex`: Requires a 32-bit ARMv7-A platform. Uses
+`mrc p15, 0, %0, c9, c13, 0` to read the cycle counter. Requires user
+access to the cycle counter, which is not enabled by default but can be
+enabled under Linux via
+[a kernel module](https://github.com/thoughtpolice/enable_arm_pmu).
+This counter is natively 32 bits, but libcpucycles watches how the
+counter and `gettimeofday` increase to compute a 64-bit extension of the
+counter.
+
+`arm64-pmc`: Requires a 64-bit ARMv8-A platform. Uses
+`mrs %0, PMCCNTR_EL0` to read the cycle counter. Requires user access
+to the cycle counter, which is not enabled by default but can be enabled
+under Linux via
+[a kernel module](https://github.com/rdolbeau/enable_arm_pmu).
+
+`arm64-vct`: Requires a 64-bit ARMv8-A platform. Uses
+`mrs %0, CNTVCT_EL0` to read a "virtual count" timer. This is an
+off-core clock, typically running at 24MHz. Results are scaled by
+libcpucycles.
+
+`mips64-cc`: Requires a 64-bit MIPS platform. (Maybe the same code would
+also work as `mips32-cc`, but this has not been tested yet.) Uses RDHWR
+to read the hardware cycle counter (hardware register 2 times a constant
+scale factor in hardware register 3). This counter is natively 32 bits,
+but libcpucycles watches how the counter and `gettimeofday` increase to
+compute a 64-bit extension of the counter.
+
+`ppc32-mftb`: Requires a 32-bit PowerPC platform. Uses `mftb` and
+`mftbu` to read the "time base". This is an off-core clock, typically
+running at 24MHz.
+
+`ppc64-mftb`: Requires a 64-bit PowerPC platform. Uses `mftb` and
+`mftbu` to read the "time base". This is an off-core clock, typically
+running at 24MHz.
+
+`riscv32-rdcycle`: Requires a 32-bit RISC-V platform. Uses `rdcycle`
+and `rdcycleh` to read a cycle counter.
+
+`riscv64-rdcycle`: Requires a 64-bit RISC-V platform. Uses `rdcycle`
+to read a cycle counter.
+
+`s390x-stckf`: Requires a 64-bit z/Architecture platform. Uses `stckf`
+to read the TOD clock, which is documented to run at 4096MHz. On the
+z15, this looks like a doubling of an off-core 2048MHz clock. Results
+are scaled by libcpucycles.
+
+`sparc64-rdtick`: Requires a 64-bit SPARC platform. Uses `rd %tick`
+to read a cycle counter.
+
+`x86-tsc`, `x86-tscasm`: Same as `amd64-tsc` and `amd64-tscasm`, but
+for 32-bit Intel/AMD platforms instead of 64-bit Intel/AMD platforms.
+
+`default-gettimeofday`: Reasonably portable. Resolution is limited to 1
+microsecond. Results are scaled by libcpucycles.
+
+`default-mach`: Requires an OS with `mach_absolute_time()`. Typically
+runs at 24MHz. Results are scaled by libcpucycles.
+
+`default-monotonic`: Requires `CLOCK_MONOTONIC`. Reasonably portable,
+although might fail on older systems where `default-gettimeofday` works.
+Resolution is limited to 1 nanosecond. Can be almost as good as a cycle
+counter, or orders of magnitude worse, depending on the OS and CPU.
+Results are scaled by libcpucycles.
+
+`default-perfevent`: Requires the Linux `perf_event` interface, and a
+CPU where `perf_event` supports `PERF_COUNT_HW_CPU_CYCLES`. Quality
+varies as with `default-monotonic`, but without the 1-nanosecond
+resolution limit.
+
+`default-zero`: The horrifying last resort if nothing else works.
+
+## Examples
+
+These are examples of `cpucycles-info` output on various machines. The
+machines named `gcc*` are from the
+[GCC Compile Farm](https://gcc.gnu.org/wiki/CompileFarm).
+
+A `median` line saying, e.g., `47 +47+28+0+2-5+0+2-5...` means that the
+differences between adjacent cycle counts were 47+47, 47+28, 47+0, 47+2,
+47−5, 47+0, 47+2, 47−5, etc., with median difference 47. The first few
+differences are typically larger because of cache effects.
+
+`pi3aplus`,
+Broadcom BCM2837B0:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 arm64-pmc precision 9 scaling 1.000000 only32 0
+cpucycles tracesetup 1 arm64-vct precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-perfevent precision 189 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-monotonic precision 272 scaling 1.400000 only32 0
+cpucycles tracesetup 5 default-gettimeofday precision 1600 scaling 1400.000000 only32 0
+cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 1400000000
+cpucycles implementation arm64-pmc
+cpucycles median 10 +10+8+3+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 1032000000...4224666667 with 1024 loops 4 microseconds
+cpucycles observed persecond 1286000000...1756000000 with 2048 loops 7 microseconds
+cpucycles observed persecond 1368266666...1598000000 with 4096 loops 14 microseconds
+cpucycles observed persecond 1366700000...1473428572 with 8192 loops 29 microseconds
+cpucycles observed persecond 1366100000...1417534483 with 16384 loops 59 microseconds
+cpucycles observed persecond 1332739837...1357132232 with 32768 loops 122 microseconds
+cpucycles observed persecond 1354483471...1366945834 with 65536 loops 241 microseconds
+cpucycles observed persecond 1385684989...1392195330 with 131072 loops 472 microseconds
+cpucycles observed persecond 1347223021...1350328528 with 262144 loops 972 microseconds
+cpucycles observed persecond 1375460125...1377069853 with 524288 loops 1905 microseconds
+cpucycles observed persecond 1376527697...1377335961 with 1048576 loops 3808 microseconds
+```
+
+`bblack`,
+TI Sitara XAM3359AZCZ100:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 arm32-cortex precision 8 scaling 1.000000 only32 1
+cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 1283 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 1200 scaling 1000.000000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 1000000000
+cpucycles implementation arm32-cortex
+cpucycles median 1260 +1506+62+31+7+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+13+7+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 622181818...2101888889 with 1024 loops 10 microseconds
+cpucycles observed persecond 806133333...1492615385 with 2048 loops 14 microseconds
+cpucycles observed persecond 879880000...1232565218 with 4096 loops 24 microseconds
+cpucycles observed persecond 939577777...1130581396 with 8192 loops 44 microseconds
+cpucycles observed persecond 956954022...1050047059 with 16384 loops 86 microseconds
+cpucycles observed persecond 982878542...1020685715 with 32768 loops 246 microseconds
+cpucycles observed persecond 988105105...1012217523 with 65536 loops 332 microseconds
+cpucycles observed persecond 993752077...1007159723 with 131072 loops 721 microseconds
+cpucycles observed persecond 995364296...1004009448 with 262144 loops 1377 microseconds
+cpucycles observed persecond 998216306...1001821536 with 524288 loops 2685 microseconds
+cpucycles observed persecond 998991848...1000914196 with 1048576 loops 5397 microseconds
+```
+
+`hiphop`,
+Intel Xeon E3-1220 v3:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 40 scaling 1.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 124 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 124 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 160 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 272 scaling 3.100000 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 3300 scaling 3100.000000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3100000000
+cpucycles implementation amd64-pmc
+cpucycles median 44 +38+23+23+23-4+0-4+0-4+0-4+0+10-4-2+1-4+1-4+1+17+1-4+1-4+1-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4
+cpucycles observed persecond 2066500000...4235000000 with 8192 loops 3 microseconds
+cpucycles observed persecond 2760833333...4200250000 with 16384 loops 5 microseconds
+cpucycles observed persecond 2743416666...3313100000 with 32768 loops 11 microseconds
+cpucycles observed persecond 2986227272...3295000000 with 65536 loops 21 microseconds
+cpucycles observed persecond 3052069767...3206073171 with 131072 loops 42 microseconds
+cpucycles observed persecond 3050395348...3125523810 with 262144 loops 85 microseconds
+cpucycles observed persecond 3085123529...3123059524 with 524288 loops 169 microseconds
+cpucycles observed persecond 3084561764...3103434912 with 1048576 loops 339 microseconds
+```
+
+`nucnuc`,
+Intel Pentium N3700:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 26 scaling 1.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 120 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 120 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 427 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 320 scaling 1.600000 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 1800 scaling 1600.000000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 1600000000
+cpucycles implementation amd64-pmc
+cpucycles median 66 +12+12+14+14-1-1+0-1+0-1+0-1+0+1-1+0-1+0-1+0-2+0-1+0-1+0-1+0-2+0-1+0-1+0-1+0-2+0-1+0-1+1-1+0-2-1-1+0-1+0-1+0-2+0-1+2+0-1+0-1+0+0-1
+cpucycles observed persecond 1060500000...2325000000 with 2048 loops 3 microseconds
+cpucycles observed persecond 1387166666...2208250000 with 4096 loops 5 microseconds
+cpucycles observed persecond 1376083333...1705500000 with 8192 loops 11 microseconds
+cpucycles observed persecond 1495727272...1671800000 with 16384 loops 21 microseconds
+cpucycles observed persecond 1563428571...1655100000 with 32768 loops 41 microseconds
+cpucycles observed persecond 1580807228...1626234568 with 65536 loops 82 microseconds
+cpucycles observed persecond 1589539393...1612619632 with 131072 loops 164 microseconds
+cpucycles observed persecond 1598841463...1610230062 with 262144 loops 327 microseconds
+cpucycles observed persecond 1564336810...1569988042 with 524288 loops 670 microseconds
+cpucycles observed persecond 1599759725...1602608098 with 1048576 loops 1310 microseconds
+```
+
+`saber214`,
+AMD FX-8350:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 167 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 168 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 376 scaling 4.013452 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 4213 scaling 4013.452000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 4013452000
+cpucycles implementation amd64-tsc
+cpucycles median 77 +87-2+21+7+4+1+0+2-2-7-4+0+1+4-2+3+1-2-2+5-6+2+2+2+2+1-1-1+0-4+0-1-1-1-2+3-1-1+2-2+0+0+2+0+0+2-2-2+1-1-2+2-5+2+0+2+0+1+0+3-2-1-1
+cpucycles observed persecond 2767500000...5759000000 with 4096 loops 3 microseconds
+cpucycles observed persecond 3426000000...4893800000 with 8192 loops 6 microseconds
+cpucycles observed persecond 3724076923...4446363637 with 16384 loops 12 microseconds
+cpucycles observed persecond 3977833333...4363318182 with 32768 loops 23 microseconds
+cpucycles observed persecond 3984854166...4168739131 with 65536 loops 47 microseconds
+cpucycles observed persecond 3981709923...4048193799 with 131072 loops 130 microseconds
+cpucycles observed persecond 3982716417...4026914573 with 262144 loops 200 microseconds
+cpucycles observed persecond 4001637602...4025136987 with 524288 loops 366 microseconds
+cpucycles observed persecond 4007411111...4018600248 with 1048576 loops 809 microseconds
+```
+
+`gcc14`,
+Intel Xeon E5-2620 v3,
+Debian testing (bookworm),
+Linux kernel 6.0.0-6-amd64:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 41 scaling 1.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 148 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 148 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 159 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 289 scaling 3.200000 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 3400 scaling 3200.000000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3200000000
+cpucycles implementation amd64-pmc
+cpucycles median 47 +47+28+0+2-5+0+2-5+16+2-5+0+2-5+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0
+cpucycles observed persecond 1653800000...2819333334 with 8192 loops 4 microseconds
+cpucycles observed persecond 1832111111...2389285715 with 16384 loops 8 microseconds
+cpucycles observed persecond 1936058823...2207200000 with 32768 loops 16 microseconds
+cpucycles observed persecond 2052843750...2196200000 with 65536 loops 31 microseconds
+cpucycles observed persecond 2050750000...2120048388 with 131072 loops 63 microseconds
+cpucycles observed persecond 2081896825...2117048388 with 262144 loops 125 microseconds
+cpucycles observed persecond 2089478087...2107044177 with 524288 loops 250 microseconds
+cpucycles observed persecond 2093343313...2102124249 with 1048576 loops 500 microseconds
+```
+
+`gcc23`,
+Cavium Octeon II V0.1,
+Debian 8.11,
+Linux kernel 4.1.4:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 mips64-cc precision 24 scaling 1.000000 only32 1
+cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 46702 scaling 2.399988 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 45799 scaling 2399.987654 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 2399987654
+cpucycles implementation mips64-cc
+cpucycles median 2177 +828+17+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 641900000...1845125000 with 1024 loops 9 microseconds
+cpucycles observed persecond 745357142...1352083334 with 2048 loops 13 microseconds
+cpucycles observed persecond 809826086...1162333334 with 4096 loops 22 microseconds
+cpucycles observed persecond 897717948...1104405406 with 8192 loops 38 microseconds
+cpucycles observed persecond 957467532...1059986667 with 16384 loops 76 microseconds
+cpucycles observed persecond 973102189...1029777778 with 32768 loops 136 microseconds
+cpucycles observed persecond 986518656...1015830828 with 65536 loops 267 microseconds
+cpucycles observed persecond 993452830...1008166667 with 131072 loops 529 microseconds
+cpucycles observed persecond 996036966...1003403609 with 262144 loops 1054 microseconds
+cpucycles observed persecond 984706378...1001682630 with 524288 loops 2131 microseconds
+cpucycles observed persecond 992585292...1001178580 with 1048576 loops 4296 microseconds
+```
+
+`gcc45`,
+AMD Athlon II X4 640,
+Debian 8.11,
+Linux kernel 3.16.0-11-686-pae:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 x86-tsc precision 199 scaling 1.000000 only32 0
+cpucycles tracesetup 1 x86-tscasm precision 199 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-perfevent precision 170 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-monotonic precision 941 scaling 3.000000 only32 0
+cpucycles tracesetup 5 default-gettimeofday precision 3200 scaling 3000.000000 only32 0
+cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3000000000
+cpucycles implementation default-perfevent
+cpucycles median 72 +12+0+0+0+0+0+0+0+5+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0
+cpucycles observed persecond 541500000...1812000000 with 1024 loops 3 microseconds
+cpucycles observed persecond 712333333...1212250000 with 2048 loops 5 microseconds
+cpucycles observed persecond 1193285714...1733600000 with 4096 loops 6 microseconds
+cpucycles observed persecond 1689176470...1804562500 with 8192 loops 33 microseconds
+cpucycles observed persecond 1713074626...1770600000 with 16384 loops 66 microseconds
+cpucycles observed persecond 1765107692...1795140625 with 32768 loops 129 microseconds
+cpucycles observed persecond 1785369649...1800603922 with 65536 loops 256 microseconds
+cpucycles observed persecond 1781377862...1796288462 with 131072 loops 261 microseconds
+cpucycles observed persecond 1772647398...1778247827 with 262144 loops 691 microseconds
+cpucycles observed persecond 1789670493...1794149598 with 524288 loops 870 microseconds
+cpucycles observed persecond 1860276211...1861561332 with 1048576 loops 3156 microseconds
+```
+
+`gcc92`,
+SiFive Freedom U740,
+Ubuntu 22.04,
+Linux kernel 5.15.0-1014-generic:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 riscv64-rdcycle precision 8 scaling 1.000000 only32 0
+cpucycles tracesetup 1 default-perfevent precision 3024 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 2599 scaling 2.399988 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 2599 scaling 2399.987654 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 2399987654
+cpucycles implementation riscv64-rdcycle
+cpucycles median 8 +33+27+1+1+1+1+0+0+0+22+0+0+0+0+0+0+0+628+0+0+0+7+0+0+0+145+0+0+0+0+0+0+0+22+0+0+0+0+0+0+0+158+0+0+0+0+0+0+0+22+0+0+0+0+0+0+0+22+0+0+0+0+0
+cpucycles observed persecond 530250000...1978000000 with 1024 loops 3 microseconds
+cpucycles observed persecond 831000000...1915666667 with 2048 loops 4 microseconds
+cpucycles observed persecond 1055750000...1689500000 with 4096 loops 7 microseconds
+cpucycles observed persecond 1045562500...1305428572 with 8192 loops 15 microseconds
+cpucycles observed persecond 1102700000...1236357143 with 16384 loops 29 microseconds
+cpucycles observed persecond 1176053571...1247444445 with 32768 loops 55 microseconds
+cpucycles observed persecond 1173321428...1209127273 with 65536 loops 111 microseconds
+cpucycles observed persecond 1187805429...1205210046 with 131072 loops 220 microseconds
+cpucycles observed persecond 1192415909...1201157535 with 262144 loops 439 microseconds
+cpucycles observed persecond 1194694760...1199247717 with 524288 loops 877 microseconds
+cpucycles observed persecond 1194656004...1197023034 with 1048576 loops 1781 microseconds
+```
+
+`gcc103`,
+Apple M1 (Icestorm-M1 + Firestorm-M1),
+Debian unstable (bookworm),
+Linux kernel 6.0.0-rc5-asahi-00001-gc62bd3fe430f:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 arm64-pmc precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 1 arm64-vct precision 186 scaling 86.000000 only32 0
+cpucycles tracesetup 2 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-monotonic precision 285 scaling 2.064000 only32 0
+cpucycles tracesetup 5 default-gettimeofday precision 2264 scaling 2064.000000 only32 0
+cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 2064000000
+cpucycles implementation arm64-vct
+cpucycles median 0 +0+86+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0
+cpucycles observed persecond 1784500000...3655000000 with 8192 loops 3 microseconds
+cpucycles observed persecond 1773750000...2393666667 with 16384 loops 7 microseconds
+cpucycles observed persecond 1897733333...2222769231 with 32768 loops 14 microseconds
+cpucycles observed persecond 1951310344...2114962963 with 65536 loops 28 microseconds
+cpucycles observed persecond 2024071428...2107000000 with 131072 loops 55 microseconds
+cpucycles observed persecond 2041531531...2082935780 with 262144 loops 110 microseconds
+cpucycles observed persecond 2051158371...2071461188 with 524288 loops 220 microseconds
+cpucycles observed persecond 2058539682...2068309795 with 1048576 loops 440 microseconds
+```
+
+`gcc112` (`gcc2-power8`),
+IBM POWER8E,
+CentOS 7.9 AltArch,
+Linux kernel 3.10.0-1127.13.1.el7.ppc64le:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 ppc64-mftb precision 251 scaling 7.207031 only32 0
+cpucycles tracesetup 1 default-perfevent precision 295 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 536 scaling 3.690000 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 3890 scaling 3690.000000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3690000000
+cpucycles implementation ppc64-mftb
+cpucycles median 195 +2969-8+14+0-8+7-8-7+7+6-7-1+0-1+0+7+7-15+7-1-7+6+0+0-8+0+6+0-8+7+0+7-8-8-7-1+7-8+7+0-8+0+14-8-7+6+0-8+7+7-15+0-1+0-1+14+0-15+14+0-1+7+0
+cpucycles observed persecond 2603750000...5510000000 with 2048 loops 3 microseconds
+cpucycles observed persecond 3430500000...6052250000 with 4096 loops 5 microseconds
+cpucycles observed persecond 3411333333...4457500000 with 8192 loops 11 microseconds
+cpucycles observed persecond 3548695652...4060333334 with 16384 loops 22 microseconds
+cpucycles observed persecond 3624977777...3876534884 with 32768 loops 44 microseconds
+cpucycles observed persecond 3621855555...3745363637 with 65536 loops 89 microseconds
+cpucycles observed persecond 3660157303...3722227273 with 131072 loops 177 microseconds
+cpucycles observed persecond 3680471751...3711622160 with 262144 loops 353 microseconds
+cpucycles observed persecond 3685321074...3700886525 with 524288 loops 706 microseconds
+cpucycles observed persecond 3687745930...3695537208 with 1048576 loops 1412 microseconds
+```
+
+`gcc202`,
+UltraSparc T5,
+Debian unstable (bookworm),
+Linux kernel 5.19.0-2-sparc64-smp:
+```
+cpucycles version 20230105
+cpucycles tracesetup 0 sparc64-rdtick precision 65 scaling 1.000000 only32 0
+cpucycles tracesetup 1 default-perfevent precision 386 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 442 scaling 3.599910 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 3799 scaling 3599.910000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3599910000
+cpucycles implementation sparc64-rdtick
+cpucycles median 73 +24+0+24+24+24+24+24+24+0+1+24+0+1+24+0+1+24+0+0+1+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 2751500000...4258250000 with 4096 loops 5 microseconds
+cpucycles observed persecond 3289200000...4206875000 with 8192 loops 9 microseconds
+cpucycles observed persecond 3454789473...3900823530 with 16384 loops 18 microseconds
+cpucycles observed persecond 3452026315...3659888889 with 32768 loops 37 microseconds
+cpucycles observed persecond 3543770270...3650916667 with 65536 loops 73 microseconds
+cpucycles observed persecond 3567299319...3620662069 with 131072 loops 146 microseconds
+cpucycles observed persecond 3591373287...3618220690 with 262144 loops 291 microseconds
+cpucycles observed persecond 3597353344...3610774527 with 524288 loops 582 microseconds
+cpucycles observed persecond 3595899403...3603058071 with 1048576 loops 1172 microseconds
+```
+
+IBM z15:
+```
+cpucycles version 20230106
+cpucycles tracesetup 0 s390x-stckf precision 250 scaling 1.269531 only32 0
+cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 272 scaling 5.200000 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 5400 scaling 5200.000000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 5200000000
+cpucycles implementation s390x-stckf
+cpucycles median 48 +87+8+0-2+0+0+38-2+0+1-3+1+28+0+3-3+1+0+28+0-2+3+0-2+36+0+0+0+1+0+28+0-2+0+3-2+35+1+0-2+0+3+28+0-2+0+0-2+3+25+3+0-2+0+1+35+1+0+0-2+0+28+0
+cpucycles observed persecond 4948941176...5627733334 with 8192 loops 16 microseconds
+cpucycles observed persecond 4104125000...5515666667 with 16384 loops 7 microseconds
+cpucycles observed persecond 5047076923...5987818182 with 32768 loops 12 microseconds
+cpucycles observed persecond 5044846153...5475708334 with 65536 loops 25 microseconds
+cpucycles observed persecond 5141313725...5357428572 with 131072 loops 50 microseconds
+cpucycles observed persecond 5150892156...5257250000 with 262144 loops 101 microseconds
+cpucycles observed persecond 5183421568...5236549505 with 524288 loops 203 microseconds
+cpucycles observed persecond 5190282555...5216582717 with 1048576 loops 406 microseconds
+```
diff --git a/cpu-cycles/libcpucycles/doc/download.md b/cpu-cycles/libcpucycles/doc/download.md
new file mode 100644
index 0000000000..6f72ddb220
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/download.md
@@ -0,0 +1,30 @@
+To download and unpack the latest version of libcpucycles:
+
+        wget -m https://cpucycles.cr.yp.to/libcpucycles-latest-version.txt
+        version=$(cat cpucycles.cr.yp.to/libcpucycles-latest-version.txt)
+        wget -m https://cpucycles.cr.yp.to/libcpucycles-$version.tar.gz
+        tar -xzf cpucycles.cr.yp.to/libcpucycles-$version.tar.gz
+        cd libcpucycles-$version
+
+Then [install](install.html).
+
+### Archives and changelog (reverse chronological)
+
+[`libcpucycles-20230115.tar.gz`](libcpucycles-20230115.tar.gz) [browse](libcpucycles-20230115.html)
+
+Update actual `cpucycles_version` behavior to match documentation.
+
+[`libcpucycles-20230110.tar.gz`](libcpucycles-20230110.tar.gz) [browse](libcpucycles-20230110.html)
+
+`doc/api.md`: Document `cpucycles_version()`.
+
+Add `s390x-stckf` counter.
+
+`cpucycles/default-perfevent.c`: Read into `int64_t` instead of `long long`.
+Add comment explaining issues with `PERF_FORMAT_TOTAL_TIME_RUNNING`.
+
+`configure`: Improve `uname` handling.
+
+`doc/api.md`: Update description of default frequency.
+
+[`libcpucycles-20230105.tar.gz`](libcpucycles-20230105.tar.gz) [browse](libcpucycles-20230105.html)
diff --git a/cpu-cycles/libcpucycles/doc/html/api.html b/cpu-cycles/libcpucycles/doc/html/api.html
new file mode 100644
index 0000000000..1547c19647
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/html/api.html
@@ -0,0 +1,91 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<style type="text/css">
+html{overflow-y:scroll}
+body{font-family:sans-serif}
+p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
+li p{font-size:1.0em}
+blockquote p{font-size:1.0em}
+tt{font-size:1.2em}
+code{font-size:1.2em}
+h1{font-size:1.5em}
+h2{font-size:1.3em}
+h3{font-size:1.0em}
+h1 a{text-decoration:none}
+table{border-collapse:collapse}
+th,td{border:1px solid black}
+table a{text-decoration:none}
+table tr{font-size:0.9em;line-height:1.6em}
+.links a:hover{text-decoration:underline}
+.links a:active{text-decoration:underline}
+.links img{width:200px;padding-left:1em}
+.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
+.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
+.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
+min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
+font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
+.here{border-bottom:0px;background-color:#ffffff}
+.away{background-color:#125d0d;}
+.away a{text-decoration:none;display:block;color:#ffffff}
+.away a:hover,.away a:active{text-decoration:underline}
+.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
+</style>
+<title>
+API</title>
+</head>
+<body>
+<div class=headline>
+libcpucycles
+</div>
+<div class=nav>
+<div class="navt away"><a href=index.html>Intro</a>
+</div><div class="navt away"><a href=download.html>Download</a>
+</div><div class="navt away"><a href=install.html>Install</a>
+</div><div class="navt here">API
+</div><div class="navt away"><a href=counters.html>Counters</a>
+</div><div class="navt away"><a href=selection.html>Selection</a>
+</div><div class="navt away"><a href=security.html>Security</a>
+</div></div>
+<div class=main>
+<h3>NAME</h3>
+<p>cpucycles - count CPU cycles</p>
+<h3>SYNOPSIS</h3>
+<pre><code>#include &lt;cpucycles.h&gt;
+
+long long count = cpucycles();
+long long persecond = cpucycles_persecond();
+const char *implementation = cpucycles_implementation();
+const char *version = cpucycles_version();
+</code></pre>
+<p>Link with <code>-lcpucycles</code>. Old systems may also need <code>-lrt</code>.</p>
+<h3>DESCRIPTION</h3>
+<p><code>cpucycles()</code> returns an estimate for the number of CPU cycles that have
+occurred since an unspecified time in the past (perhaps system boot,
+perhaps program startup).</p>
+<p>Accessing true cycle counters can be difficult on some CPUs and
+operating systems. <code>cpucycles()</code> does its best to produce accurate
+results, but selects a low-precision counter if the only other option is
+failure.</p>
+<p><code>cpucycles_persecond()</code> returns an estimate for the number of CPU cycles
+per second. This estimate comes from <code>/etc/cpucyclespersecond</code> if that
+file exists, otherwise from various OS mechanisms, otherwise from the
+<code>cpucyclespersecond</code> environment variable if that is set, otherwise
+2399987654.</p>
+<p><code>cpucycles_implementation()</code> returns the name of the counter in use:
+e.g., <code>"amd64-pmc"</code>.</p>
+<p><code>cpucycles_version()</code> returns the <code>libcpucycles</code> version number as a
+string: e.g., <code>"20230115"</code>. Results of <code>cpucycles_implementation()</code>
+should be interpreted relative to <code>cpucycles_version()</code>.</p>
+<p><code>cpucycles</code> is actually a function pointer. The first call to
+<code>cpucycles()</code> or <code>cpucycles_persecond()</code> or <code>cpucycles_implementation()</code>
+selects one of the available counters and updates the <code>cpucycles</code>
+pointer accordingly. Subsequent calls to <code>cpucycles()</code> are thread-safe.</p>
+<h3>SEE ALSO</h3>
+<p><strong>gettimeofday</strong>(2), <strong>clock_gettime</strong>(2)</p><hr><font size=1><b>Version:</b>
+This is version 2023.01.15 of the "API" web page.
+</font>
+</div>
+</body>
+</html>
diff --git a/cpu-cycles/libcpucycles/doc/html/counters.html b/cpu-cycles/libcpucycles/doc/html/counters.html
new file mode 100644
index 0000000000..6ebc06222c
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/html/counters.html
@@ -0,0 +1,456 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<style type="text/css">
+html{overflow-y:scroll}
+body{font-family:sans-serif}
+p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
+li p{font-size:1.0em}
+blockquote p{font-size:1.0em}
+tt{font-size:1.2em}
+code{font-size:1.2em}
+h1{font-size:1.5em}
+h2{font-size:1.3em}
+h3{font-size:1.0em}
+h1 a{text-decoration:none}
+table{border-collapse:collapse}
+th,td{border:1px solid black}
+table a{text-decoration:none}
+table tr{font-size:0.9em;line-height:1.6em}
+.links a:hover{text-decoration:underline}
+.links a:active{text-decoration:underline}
+.links img{width:200px;padding-left:1em}
+.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
+.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
+.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
+min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
+font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
+.here{border-bottom:0px;background-color:#ffffff}
+.away{background-color:#125d0d;}
+.away a{text-decoration:none;display:block;color:#ffffff}
+.away a:hover,.away a:active{text-decoration:underline}
+.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
+</style>
+<title>
+Counters</title>
+</head>
+<body>
+<div class=headline>
+libcpucycles
+</div>
+<div class=nav>
+<div class="navt away"><a href=index.html>Intro</a>
+</div><div class="navt away"><a href=download.html>Download</a>
+</div><div class="navt away"><a href=install.html>Install</a>
+</div><div class="navt away"><a href=api.html>API</a>
+</div><div class="navt here">Counters
+</div><div class="navt away"><a href=selection.html>Selection</a>
+</div><div class="navt away"><a href=security.html>Security</a>
+</div></div>
+<div class=main>
+<p>Currently libcpucycles supports the following cycle counters. Some
+cycle counters are actually other forms of counters that libcpucycles
+scales to imitate a cycle counter. There is
+<a href="selection.html">separate documentation</a>
+for how libcpucycles makes a choice of cycle counter. See also
+<a href="security.html">security considerations</a> regarding enabling or disabling
+counters and regarding Turbo Boost.</p>
+<p><code>amd64-pmc</code>: Requires a 64-bit Intel/AMD platform. Requires the Linux
+perf_event interface. Accesses a cycle counter through RDPMC. Requires
+<code>/proc/sys/kernel/perf_event_paranoid</code> to be at most 2 for user-level
+RDPMC access. This counter runs at the clock frequency of the CPU core.</p>
+<p><code>amd64-tsc</code>, <code>amd64-tscasm</code>: Requires a 64-bit Intel/AMD platform.
+Requires RDTSC to be enabled, which it is by default. Uses RDTSC to
+access the CPU's time-stamp counter. On current CPUs, this is an
+off-core clock rather than a cycle counter, but it is typically a very
+fast off-core clock, making it adequate for seeing cycle counts if
+overclocking and underclocking are disabled. The difference between
+<code>tsc</code> and <code>tscasm</code> is that <code>tsc</code> uses the compiler's <code>__rdtsc()</code> while
+<code>tscasm</code> uses inline assembly.</p>
+<p><code>arm32-cortex</code>: Requires a 32-bit ARMv7-A platform. Uses
+<code>mrc p15, 0, %0, c9, c13, 0</code> to read the cycle counter. Requires user
+access to the cycle counter, which is not enabled by default but can be
+enabled under Linux via
+<a href="https://github.com/thoughtpolice/enable_arm_pmu">a kernel module</a>.
+This counter is natively 32 bits, but libcpucycles watches how the
+counter and <code>gettimeofday</code> increase to compute a 64-bit extension of the
+counter.</p>
+<p><code>arm64-pmc</code>: Requires a 64-bit ARMv8-A platform. Uses
+<code>mrs %0, PMCCNTR_EL0</code> to read the cycle counter. Requires user access
+to the cycle counter, which is not enabled by default but can be enabled
+under Linux via
+<a href="https://github.com/rdolbeau/enable_arm_pmu">a kernel module</a>.</p>
+<p><code>arm64-vct</code>: Requires a 64-bit ARMv8-A platform. Uses
+<code>mrs %0, CNTVCT_EL0</code> to read a "virtual count" timer. This is an
+off-core clock, typically running at 24MHz. Results are scaled by
+libcpucycles.</p>
+<p><code>mips64-cc</code>: Requires a 64-bit MIPS platform. (Maybe the same code would
+also work as <code>mips32-cc</code>, but this has not been tested yet.) Uses RDHWR
+to read the hardware cycle counter (hardware register 2 times a constant
+scale factor in hardware register 3). This counter is natively 32 bits,
+but libcpucycles watches how the counter and <code>gettimeofday</code> increase to
+compute a 64-bit extension of the counter.</p>
+<p><code>ppc32-mftb</code>: Requires a 32-bit PowerPC platform. Uses <code>mftb</code> and
+<code>mftbu</code> to read the "time base". This is an off-core clock, typically
+running at 24MHz.</p>
+<p><code>ppc64-mftb</code>: Requires a 64-bit PowerPC platform. Uses <code>mftb</code> and
+<code>mftbu</code> to read the "time base". This is an off-core clock, typically
+running at 512MHz. Results are scaled by libcpucycles.</p>
+<p><code>riscv32-rdcycle</code>: Requires a 32-bit RISC-V platform. Uses <code>rdcycle</code>
+and <code>rdcycleh</code> to read a cycle counter.</p>
+<p><code>riscv64-rdcycle</code>: Requires a 64-bit RISC-V platform. Uses <code>rdcycle</code>
+to read a cycle counter.</p>
+<p><code>s390x-stckf</code>: Requires a 64-bit z/Architecture platform. Uses <code>stckf</code>
+to read the TOD clock, which is documented to run at 4096MHz. On the
+z15, this looks like a doubling of an off-core 2048MHz clock. Results
+are scaled by libcpucycles.</p>
+<p><code>sparc64-rdtick</code>: Requires a 64-bit SPARC platform. Uses <code>rd %tick</code>
+to read a cycle counter.</p>
+<p><code>x86-tsc</code>, <code>x86-tscasm</code>: Same as <code>amd64-tsc</code> and <code>amd64-tscasm</code>, but
+for 32-bit Intel/AMD platforms instead of 64-bit Intel/AMD platforms.</p>
+<p><code>default-gettimeofday</code>: Reasonably portable. Resolution is limited to 1
+microsecond. Results are scaled by libcpucycles.</p>
+<p><code>default-mach</code>: Requires an OS with <code>mach_absolute_time()</code>. Typically
+runs at 24MHz. Results are scaled by libcpucycles.</p>
+<p><code>default-monotonic</code>: Requires <code>CLOCK_MONOTONIC</code>. Reasonably portable,
+although might fail on older systems where <code>default-gettimeofday</code> works.
+Resolution is limited to 1 nanosecond. Can be almost as good as a cycle
+counter, or orders of magnitude worse, depending on the OS and CPU.
+Results are scaled by libcpucycles.</p>
+<p><code>default-perfevent</code>: Requires the Linux <code>perf_event</code> interface, and a
+CPU where <code>perf_event</code> supports <code>PERF_COUNT_HW_CPU_CYCLES</code>. Similar
+variations in quality to <code>default-monotonic</code>, without the 1-nanosecond
+limitation.</p>
+<p><code>default-zero</code>: The horrifying last resort if nothing else works.</p>
+<h2>Examples</h2>
+<p>These are examples of <code>cpucycles-info</code> output on various machines. The
+machines named <code>gcc*</code> are from the
+<a href="https://gcc.gnu.org/wiki/CompileFarm">GCC Compile Farm</a>.</p>
+<p>A <code>median</code> line saying, e.g., <code>47 +47+28+0+2-5+0+2-5...</code> means that the
+differences between adjacent cycle counts were 47+47, 47+28, 47+0, 47+2,
+47−5, 47+0, 47+2, 47−5, etc., with median difference 47. The first few
+differences are typically larger because of cache effects.</p>
+<p><code>pi3aplus</code>,
+Broadcom BCM2837B0:</p>
+<pre><code>cpucycles version 20230105
+cpucycles tracesetup 0 arm64-pmc precision 9 scaling 1.000000 only32 0
+cpucycles tracesetup 1 arm64-vct precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-perfevent precision 189 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-monotonic precision 272 scaling 1.400000 only32 0
+cpucycles tracesetup 5 default-gettimeofday precision 1600 scaling 1400.000000 only32 0
+cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 1400000000
+cpucycles implementation arm64-pmc
+cpucycles median 10 +10+8+3+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 1032000000...4224666667 with 1024 loops 4 microseconds
+cpucycles observed persecond 1286000000...1756000000 with 2048 loops 7 microseconds
+cpucycles observed persecond 1368266666...1598000000 with 4096 loops 14 microseconds
+cpucycles observed persecond 1366700000...1473428572 with 8192 loops 29 microseconds
+cpucycles observed persecond 1366100000...1417534483 with 16384 loops 59 microseconds
+cpucycles observed persecond 1332739837...1357132232 with 32768 loops 122 microseconds
+cpucycles observed persecond 1354483471...1366945834 with 65536 loops 241 microseconds
+cpucycles observed persecond 1385684989...1392195330 with 131072 loops 472 microseconds
+cpucycles observed persecond 1347223021...1350328528 with 262144 loops 972 microseconds
+cpucycles observed persecond 1375460125...1377069853 with 524288 loops 1905 microseconds
+cpucycles observed persecond 1376527697...1377335961 with 1048576 loops 3808 microseconds
+</code></pre>
+<p><code>bblack</code>,
+TI Sitara XAM3359AZCZ100:</p>
+<pre><code>cpucycles version 20230105
+cpucycles tracesetup 0 arm32-cortex precision 8 scaling 1.000000 only32 1
+cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 1283 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 1200 scaling 1000.000000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 1000000000
+cpucycles implementation arm32-cortex
+cpucycles median 1260 +1506+62+31+7+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+13+7+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 622181818...2101888889 with 1024 loops 10 microseconds
+cpucycles observed persecond 806133333...1492615385 with 2048 loops 14 microseconds
+cpucycles observed persecond 879880000...1232565218 with 4096 loops 24 microseconds
+cpucycles observed persecond 939577777...1130581396 with 8192 loops 44 microseconds
+cpucycles observed persecond 956954022...1050047059 with 16384 loops 86 microseconds
+cpucycles observed persecond 982878542...1020685715 with 32768 loops 246 microseconds
+cpucycles observed persecond 988105105...1012217523 with 65536 loops 332 microseconds
+cpucycles observed persecond 993752077...1007159723 with 131072 loops 721 microseconds
+cpucycles observed persecond 995364296...1004009448 with 262144 loops 1377 microseconds
+cpucycles observed persecond 998216306...1001821536 with 524288 loops 2685 microseconds
+cpucycles observed persecond 998991848...1000914196 with 1048576 loops 5397 microseconds
+</code></pre>
+<p><code>hiphop</code>,
+Intel Xeon E3-1220 v3:</p>
+<pre><code>cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 40 scaling 1.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 124 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 124 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 160 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 272 scaling 3.100000 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 3300 scaling 3100.000000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3100000000
+cpucycles implementation amd64-pmc
+cpucycles median 44 +38+23+23+23-4+0-4+0-4+0-4+0+10-4-2+1-4+1-4+1+17+1-4+1-4+1-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4
+cpucycles observed persecond 2066500000...4235000000 with 8192 loops 3 microseconds
+cpucycles observed persecond 2760833333...4200250000 with 16384 loops 5 microseconds
+cpucycles observed persecond 2743416666...3313100000 with 32768 loops 11 microseconds
+cpucycles observed persecond 2986227272...3295000000 with 65536 loops 21 microseconds
+cpucycles observed persecond 3052069767...3206073171 with 131072 loops 42 microseconds
+cpucycles observed persecond 3050395348...3125523810 with 262144 loops 85 microseconds
+cpucycles observed persecond 3085123529...3123059524 with 524288 loops 169 microseconds
+cpucycles observed persecond 3084561764...3103434912 with 1048576 loops 339 microseconds
+</code></pre>
+<p><code>nucnuc</code>,
+Intel Pentium N3700:</p>
+<pre><code>cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 26 scaling 1.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 120 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 120 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 427 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 320 scaling 1.600000 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 1800 scaling 1600.000000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 1600000000
+cpucycles implementation amd64-pmc
+cpucycles median 66 +12+12+14+14-1-1+0-1+0-1+0-1+0+1-1+0-1+0-1+0-2+0-1+0-1+0-1+0-2+0-1+0-1+0-1+0-2+0-1+0-1+1-1+0-2-1-1+0-1+0-1+0-2+0-1+2+0-1+0-1+0+0-1
+cpucycles observed persecond 1060500000...2325000000 with 2048 loops 3 microseconds
+cpucycles observed persecond 1387166666...2208250000 with 4096 loops 5 microseconds
+cpucycles observed persecond 1376083333...1705500000 with 8192 loops 11 microseconds
+cpucycles observed persecond 1495727272...1671800000 with 16384 loops 21 microseconds
+cpucycles observed persecond 1563428571...1655100000 with 32768 loops 41 microseconds
+cpucycles observed persecond 1580807228...1626234568 with 65536 loops 82 microseconds
+cpucycles observed persecond 1589539393...1612619632 with 131072 loops 164 microseconds
+cpucycles observed persecond 1598841463...1610230062 with 262144 loops 327 microseconds
+cpucycles observed persecond 1564336810...1569988042 with 524288 loops 670 microseconds
+cpucycles observed persecond 1599759725...1602608098 with 1048576 loops 1310 microseconds
+</code></pre>
+<p><code>saber214</code>,
+AMD FX-8350:</p>
+<pre><code>cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 167 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 168 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 376 scaling 4.013452 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 4213 scaling 4013.452000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 4013452000
+cpucycles implementation amd64-tsc
+cpucycles median 77 +87-2+21+7+4+1+0+2-2-7-4+0+1+4-2+3+1-2-2+5-6+2+2+2+2+1-1-1+0-4+0-1-1-1-2+3-1-1+2-2+0+0+2+0+0+2-2-2+1-1-2+2-5+2+0+2+0+1+0+3-2-1-1
+cpucycles observed persecond 2767500000...5759000000 with 4096 loops 3 microseconds
+cpucycles observed persecond 3426000000...4893800000 with 8192 loops 6 microseconds
+cpucycles observed persecond 3724076923...4446363637 with 16384 loops 12 microseconds
+cpucycles observed persecond 3977833333...4363318182 with 32768 loops 23 microseconds
+cpucycles observed persecond 3984854166...4168739131 with 65536 loops 47 microseconds
+cpucycles observed persecond 3981709923...4048193799 with 131072 loops 130 microseconds
+cpucycles observed persecond 3982716417...4026914573 with 262144 loops 200 microseconds
+cpucycles observed persecond 4001637602...4025136987 with 524288 loops 366 microseconds
+cpucycles observed persecond 4007411111...4018600248 with 1048576 loops 809 microseconds
+</code></pre>
+<p><code>gcc14</code>,
+Intel Xeon E5-2620 v3,
+Debian testing (bookworm),
+Linux kernel 6.0.0-6-amd64:</p>
+<pre><code>cpucycles version 20230105
+cpucycles tracesetup 0 amd64-pmc precision 41 scaling 1.000000 only32 0
+cpucycles tracesetup 1 amd64-tsc precision 148 scaling 1.000000 only32 0
+cpucycles tracesetup 2 amd64-tscasm precision 148 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-perfevent precision 159 scaling 1.000000 only32 0
+cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 5 default-monotonic precision 289 scaling 3.200000 only32 0
+cpucycles tracesetup 6 default-gettimeofday precision 3400 scaling 3200.000000 only32 0
+cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3200000000
+cpucycles implementation amd64-pmc
+cpucycles median 47 +47+28+0+2-5+0+2-5+16+2-5+0+2-5+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0
+cpucycles observed persecond 1653800000...2819333334 with 8192 loops 4 microseconds
+cpucycles observed persecond 1832111111...2389285715 with 16384 loops 8 microseconds
+cpucycles observed persecond 1936058823...2207200000 with 32768 loops 16 microseconds
+cpucycles observed persecond 2052843750...2196200000 with 65536 loops 31 microseconds
+cpucycles observed persecond 2050750000...2120048388 with 131072 loops 63 microseconds
+cpucycles observed persecond 2081896825...2117048388 with 262144 loops 125 microseconds
+cpucycles observed persecond 2089478087...2107044177 with 524288 loops 250 microseconds
+cpucycles observed persecond 2093343313...2102124249 with 1048576 loops 500 microseconds
+</code></pre>
+<p><code>gcc23</code>,
+Cavium Octeon II V0.1,
+Debian 8.11,
+Linux kernel 4.1.4:</p>
+<pre><code>cpucycles version 20230105
+cpucycles tracesetup 0 mips64-cc precision 24 scaling 1.000000 only32 1
+cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 46702 scaling 2.399988 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 45799 scaling 2399.987654 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 2399987654
+cpucycles implementation mips64-cc
+cpucycles median 2177 +828+17+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 641900000...1845125000 with 1024 loops 9 microseconds
+cpucycles observed persecond 745357142...1352083334 with 2048 loops 13 microseconds
+cpucycles observed persecond 809826086...1162333334 with 4096 loops 22 microseconds
+cpucycles observed persecond 897717948...1104405406 with 8192 loops 38 microseconds
+cpucycles observed persecond 957467532...1059986667 with 16384 loops 76 microseconds
+cpucycles observed persecond 973102189...1029777778 with 32768 loops 136 microseconds
+cpucycles observed persecond 986518656...1015830828 with 65536 loops 267 microseconds
+cpucycles observed persecond 993452830...1008166667 with 131072 loops 529 microseconds
+cpucycles observed persecond 996036966...1003403609 with 262144 loops 1054 microseconds
+cpucycles observed persecond 984706378...1001682630 with 524288 loops 2131 microseconds
+cpucycles observed persecond 992585292...1001178580 with 1048576 loops 4296 microseconds
+</code></pre>
+<p><code>gcc45</code>,
+AMD Athlon II X4 640,
+Debian 8.11,
+Linux kernel 3.16.0-11-686-pae:</p>
+<pre><code>cpucycles version 20230105
+cpucycles tracesetup 0 x86-tsc precision 199 scaling 1.000000 only32 0
+cpucycles tracesetup 1 x86-tscasm precision 199 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-perfevent precision 170 scaling 1.000000 only32 0
+cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-monotonic precision 941 scaling 3.000000 only32 0
+cpucycles tracesetup 5 default-gettimeofday precision 3200 scaling 3000.000000 only32 0
+cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3000000000
+cpucycles implementation default-perfevent
+cpucycles median 72 +12+0+0+0+0+0+0+0+5+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0
+cpucycles observed persecond 541500000...1812000000 with 1024 loops 3 microseconds
+cpucycles observed persecond 712333333...1212250000 with 2048 loops 5 microseconds
+cpucycles observed persecond 1193285714...1733600000 with 4096 loops 6 microseconds
+cpucycles observed persecond 1689176470...1804562500 with 8192 loops 33 microseconds
+cpucycles observed persecond 1713074626...1770600000 with 16384 loops 66 microseconds
+cpucycles observed persecond 1765107692...1795140625 with 32768 loops 129 microseconds
+cpucycles observed persecond 1785369649...1800603922 with 65536 loops 256 microseconds
+cpucycles observed persecond 1781377862...1796288462 with 131072 loops 261 microseconds
+cpucycles observed persecond 1772647398...1778247827 with 262144 loops 691 microseconds
+cpucycles observed persecond 1789670493...1794149598 with 524288 loops 870 microseconds
+cpucycles observed persecond 1860276211...1861561332 with 1048576 loops 3156 microseconds
+</code></pre>
+<p><code>gcc92</code>,
+SiFive Freedom U740,
+Ubuntu 22.04,
+Linux kernel 5.15.0-1014-generic:</p>
+<pre><code>cpucycles version 20230105
+cpucycles tracesetup 0 riscv64-rdcycle precision 8 scaling 1.000000 only32 0
+cpucycles tracesetup 1 default-perfevent precision 3024 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 2599 scaling 2.399988 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 2599 scaling 2399.987654 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 2399987654
+cpucycles implementation riscv64-rdcycle
+cpucycles median 8 +33+27+1+1+1+1+0+0+0+22+0+0+0+0+0+0+0+628+0+0+0+7+0+0+0+145+0+0+0+0+0+0+0+22+0+0+0+0+0+0+0+158+0+0+0+0+0+0+0+22+0+0+0+0+0+0+0+22+0+0+0+0+0
+cpucycles observed persecond 530250000...1978000000 with 1024 loops 3 microseconds
+cpucycles observed persecond 831000000...1915666667 with 2048 loops 4 microseconds
+cpucycles observed persecond 1055750000...1689500000 with 4096 loops 7 microseconds
+cpucycles observed persecond 1045562500...1305428572 with 8192 loops 15 microseconds
+cpucycles observed persecond 1102700000...1236357143 with 16384 loops 29 microseconds
+cpucycles observed persecond 1176053571...1247444445 with 32768 loops 55 microseconds
+cpucycles observed persecond 1173321428...1209127273 with 65536 loops 111 microseconds
+cpucycles observed persecond 1187805429...1205210046 with 131072 loops 220 microseconds
+cpucycles observed persecond 1192415909...1201157535 with 262144 loops 439 microseconds
+cpucycles observed persecond 1194694760...1199247717 with 524288 loops 877 microseconds
+cpucycles observed persecond 1194656004...1197023034 with 1048576 loops 1781 microseconds
+</code></pre>
+<p><code>gcc103</code>,
+Apple M1 (Icestorm-M1 + Firestorm-M1),
+Debian unstable (bookworm),
+Linux kernel 6.0.0-rc5-asahi-00001-gc62bd3fe430f:</p>
+<pre><code>cpucycles version 20230105
+cpucycles tracesetup 0 arm64-pmc precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 1 arm64-vct precision 186 scaling 86.000000 only32 0
+cpucycles tracesetup 2 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 4 default-monotonic precision 285 scaling 2.064000 only32 0
+cpucycles tracesetup 5 default-gettimeofday precision 2264 scaling 2064.000000 only32 0
+cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 2064000000
+cpucycles implementation arm64-vct
+cpucycles median 0 +0+86+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0
+cpucycles observed persecond 1784500000...3655000000 with 8192 loops 3 microseconds
+cpucycles observed persecond 1773750000...2393666667 with 16384 loops 7 microseconds
+cpucycles observed persecond 1897733333...2222769231 with 32768 loops 14 microseconds
+cpucycles observed persecond 1951310344...2114962963 with 65536 loops 28 microseconds
+cpucycles observed persecond 2024071428...2107000000 with 131072 loops 55 microseconds
+cpucycles observed persecond 2041531531...2082935780 with 262144 loops 110 microseconds
+cpucycles observed persecond 2051158371...2071461188 with 524288 loops 220 microseconds
+cpucycles observed persecond 2058539682...2068309795 with 1048576 loops 440 microseconds
+</code></pre>
+<p><code>gcc112</code> (<code>gcc2-power8</code>),
+IBM POWER8E,
+CentOS 7.9 AltArch,
+Linux kernel 3.10.0-1127.13.1.el7.ppc64le:</p>
+<pre><code>cpucycles version 20230105
+cpucycles tracesetup 0 ppc64-mftb precision 251 scaling 7.207031 only32 0
+cpucycles tracesetup 1 default-perfevent precision 295 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 536 scaling 3.690000 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 3890 scaling 3690.000000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3690000000
+cpucycles implementation ppc64-mftb
+cpucycles median 195 +2969-8+14+0-8+7-8-7+7+6-7-1+0-1+0+7+7-15+7-1-7+6+0+0-8+0+6+0-8+7+0+7-8-8-7-1+7-8+7+0-8+0+14-8-7+6+0-8+7+7-15+0-1+0-1+14+0-15+14+0-1+7+0
+cpucycles observed persecond 2603750000...5510000000 with 2048 loops 3 microseconds
+cpucycles observed persecond 3430500000...6052250000 with 4096 loops 5 microseconds
+cpucycles observed persecond 3411333333...4457500000 with 8192 loops 11 microseconds
+cpucycles observed persecond 3548695652...4060333334 with 16384 loops 22 microseconds
+cpucycles observed persecond 3624977777...3876534884 with 32768 loops 44 microseconds
+cpucycles observed persecond 3621855555...3745363637 with 65536 loops 89 microseconds
+cpucycles observed persecond 3660157303...3722227273 with 131072 loops 177 microseconds
+cpucycles observed persecond 3680471751...3711622160 with 262144 loops 353 microseconds
+cpucycles observed persecond 3685321074...3700886525 with 524288 loops 706 microseconds
+cpucycles observed persecond 3687745930...3695537208 with 1048576 loops 1412 microseconds
+</code></pre>
+<p><code>gcc202</code>,
+UltraSparc T5,
+Debian unstable (bookworm),
+Linux kernel 5.19.0-2-sparc64-smp:</p>
+<pre><code>cpucycles version 20230105
+cpucycles tracesetup 0 sparc64-rdtick precision 65 scaling 1.000000 only32 0
+cpucycles tracesetup 1 default-perfevent precision 386 scaling 1.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 442 scaling 3.599910 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 3799 scaling 3599.910000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 3599910000
+cpucycles implementation sparc64-rdtick
+cpucycles median 73 +24+0+24+24+24+24+24+24+0+1+24+0+1+24+0+1+24+0+0+1+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+0+0+0+0+0+0
+cpucycles observed persecond 2751500000...4258250000 with 4096 loops 5 microseconds
+cpucycles observed persecond 3289200000...4206875000 with 8192 loops 9 microseconds
+cpucycles observed persecond 3454789473...3900823530 with 16384 loops 18 microseconds
+cpucycles observed persecond 3452026315...3659888889 with 32768 loops 37 microseconds
+cpucycles observed persecond 3543770270...3650916667 with 65536 loops 73 microseconds
+cpucycles observed persecond 3567299319...3620662069 with 131072 loops 146 microseconds
+cpucycles observed persecond 3591373287...3618220690 with 262144 loops 291 microseconds
+cpucycles observed persecond 3597353344...3610774527 with 524288 loops 582 microseconds
+cpucycles observed persecond 3595899403...3603058071 with 1048576 loops 1172 microseconds
+</code></pre>
+<p>IBM z15:</p>
+<pre><code>cpucycles version 20230106
+cpucycles tracesetup 0 s390x-stckf precision 250 scaling 1.269531 only32 0
+cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
+cpucycles tracesetup 3 default-monotonic precision 272 scaling 5.200000 only32 0
+cpucycles tracesetup 4 default-gettimeofday precision 5400 scaling 5200.000000 only32 0
+cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
+cpucycles persecond 5200000000
+cpucycles implementation s390x-stckf
+cpucycles median 48 +87+8+0-2+0+0+38-2+0+1-3+1+28+0+3-3+1+0+28+0-2+3+0-2+36+0+0+0+1+0+28+0-2+0+3-2+35+1+0-2+0+3+28+0-2+0+0-2+3+25+3+0-2+0+1+35+1+0+0-2+0+28+0
+cpucycles observed persecond 4948941176...5627733334 with 8192 loops 16 microseconds
+cpucycles observed persecond 4104125000...5515666667 with 16384 loops 7 microseconds
+cpucycles observed persecond 5047076923...5987818182 with 32768 loops 12 microseconds
+cpucycles observed persecond 5044846153...5475708334 with 65536 loops 25 microseconds
+cpucycles observed persecond 5141313725...5357428572 with 131072 loops 50 microseconds
+cpucycles observed persecond 5150892156...5257250000 with 262144 loops 101 microseconds
+cpucycles observed persecond 5183421568...5236549505 with 524288 loops 203 microseconds
+cpucycles observed persecond 5190282555...5216582717 with 1048576 loops 406 microseconds
+</code></pre><hr><font size=1><b>Version:</b>
+This is version 2023.01.06 of the "Counters" web page.
+</font>
+</div>
+</body>
+</html>
diff --git a/cpu-cycles/libcpucycles/doc/html/download.html b/cpu-cycles/libcpucycles/doc/html/download.html
new file mode 100644
index 0000000000..9a4230bf1b
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/html/download.html
@@ -0,0 +1,75 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<style type="text/css">
+html{overflow-y:scroll}
+body{font-family:sans-serif}
+p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
+li p{font-size:1.0em}
+blockquote p{font-size:1.0em}
+tt{font-size:1.2em}
+code{font-size:1.2em}
+h1{font-size:1.5em}
+h2{font-size:1.3em}
+h3{font-size:1.0em}
+h1 a{text-decoration:none}
+table{border-collapse:collapse}
+th,td{border:1px solid black}
+table a{text-decoration:none}
+table tr{font-size:0.9em;line-height:1.6em}
+.links a:hover{text-decoration:underline}
+.links a:active{text-decoration:underline}
+.links img{width:200px;padding-left:1em}
+.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
+.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
+.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
+min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
+font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
+.here{border-bottom:0px;background-color:#ffffff}
+.away{background-color:#125d0d;}
+.away a{text-decoration:none;display:block;color:#ffffff}
+.away a:hover,.away a:active{text-decoration:underline}
+.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
+</style>
+<title>
+Download</title>
+</head>
+<body>
+<div class=headline>
+libcpucycles
+</div>
+<div class=nav>
+<div class="navt away"><a href=index.html>Intro</a>
+</div><div class="navt here">Download
+</div><div class="navt away"><a href=install.html>Install</a>
+</div><div class="navt away"><a href=api.html>API</a>
+</div><div class="navt away"><a href=counters.html>Counters</a>
+</div><div class="navt away"><a href=selection.html>Selection</a>
+</div><div class="navt away"><a href=security.html>Security</a>
+</div></div>
+<div class=main>
+<p>To download and unpack the latest version of libcpucycles:</p>
+<pre><code>    wget -m https://cpucycles.cr.yp.to/libcpucycles-latest-version.txt
+    version=$(cat cpucycles.cr.yp.to/libcpucycles-latest-version.txt)
+    wget -m https://cpucycles.cr.yp.to/libcpucycles-$version.tar.gz
+    tar -xzf cpucycles.cr.yp.to/libcpucycles-$version.tar.gz
+    cd libcpucycles-$version
+</code></pre>
+<p>Then <a href="install.html">install</a>.</p>
+<h3>Archives and changelog (reverse chronological)</h3>
+<p><a href="libcpucycles-20230115.tar.gz"><code>libcpucycles-20230115.tar.gz</code></a> <a href="libcpucycles-20230115.html">browse</a></p>
+<p>Update actual <code>cpucycles_version</code> behavior to match documentation.</p>
+<p><a href="libcpucycles-20230110.tar.gz"><code>libcpucycles-20230110.tar.gz</code></a> <a href="libcpucycles-20230110.html">browse</a></p>
+<p><code>doc/api.md</code>: Document <code>cpucycles_version()</code>.</p>
+<p>Add <code>s390x-stckf</code> counter.</p>
+<p><code>cpucycles/default-perfevent.c</code>: Read into <code>int64_t</code> instead of <code>long long</code>.
+Add comment explaining issues with <code>PERF_FORMAT_TOTAL_TIME_RUNNING</code>.</p>
+<p><code>configure</code>: Improve <code>uname</code> handling.</p>
+<p><code>doc/api.md</code>: Update description of default frequency.</p>
+<p><a href="libcpucycles-20230105.tar.gz"><code>libcpucycles-20230105.tar.gz</code></a> <a href="libcpucycles-20230105.html">browse</a></p><hr><font size=1><b>Version:</b>
+This is version 2023.01.15 of the "Download" web page.
+</font>
+</div>
+</body>
+</html>
diff --git a/cpu-cycles/libcpucycles/doc/html/index.html b/cpu-cycles/libcpucycles/doc/html/index.html
new file mode 100644
index 0000000000..c6ecb3e1d1
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/html/index.html
@@ -0,0 +1,88 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<style type="text/css">
+html{overflow-y:scroll}
+body{font-family:sans-serif}
+p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
+li p{font-size:1.0em}
+blockquote p{font-size:1.0em}
+tt{font-size:1.2em}
+code{font-size:1.2em}
+h1{font-size:1.5em}
+h2{font-size:1.3em}
+h3{font-size:1.0em}
+h1 a{text-decoration:none}
+table{border-collapse:collapse}
+th,td{border:1px solid black}
+table a{text-decoration:none}
+table tr{font-size:0.9em;line-height:1.6em}
+.links a:hover{text-decoration:underline}
+.links a:active{text-decoration:underline}
+.links img{width:200px;padding-left:1em}
+.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
+.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
+.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
+min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
+font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
+.here{border-bottom:0px;background-color:#ffffff}
+.away{background-color:#125d0d;}
+.away a{text-decoration:none;display:block;color:#ffffff}
+.away a:hover,.away a:active{text-decoration:underline}
+.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
+</style>
+<title>
+Intro</title>
+</head>
+<body>
+<div class=headline>
+libcpucycles
+</div>
+<div class=nav>
+<div class="navt here">Intro
+</div><div class="navt away"><a href=download.html>Download</a>
+</div><div class="navt away"><a href=install.html>Install</a>
+</div><div class="navt away"><a href=api.html>API</a>
+</div><div class="navt away"><a href=counters.html>Counters</a>
+</div><div class="navt away"><a href=selection.html>Selection</a>
+</div><div class="navt away"><a href=security.html>Security</a>
+</div></div>
+<div class=main>
+<p>libcpucycles is a public-domain microlibrary for counting CPU cycles.
+Cycle counts are not as detailed as
+<a href="https://gamozolabs.github.io/metrology/2019/08/19/sushi_roll.html">Falk diagrams</a>
+but are the most precise timers available to typical software; they are
+central tools used in understanding and improving software performance.</p>
+<p>The libcpucycles <a href="api.html">API</a> is simple: include <code>&lt;cpucycles.h&gt;</code>, call
+<code>cpucycles()</code> to receive a <code>long long</code> whenever desired, and link with
+<code>-lcpucycles</code>.</p>
+<p><a href="counters.html">Internally</a>, libcpucycles understands machine-level
+cycle counters for amd64 (both PMC and TSC), arm32, arm64 (both PMC and
+VCT), mips64, ppc32, ppc64, riscv32, riscv64, s390x, sparc64, and x86.
+libcpucycles also understands four OS-level mechanisms, which give
+varying levels of accuracy: <code>mach_absolute_time</code>, <code>perf_event</code>,
+<code>CLOCK_MONOTONIC</code>, and, as a fallback, microsecond-resolution
+<code>gettimeofday</code>.</p>
+<p>When the program first calls <code>cpucycles()</code>, libcpucycles automatically
+benchmarks the available mechanisms and <a href="selection.html">selects</a> the
+mechanism that does the best job. Subsequent <code>cpucycles()</code> calls are
+thread-safe and very fast. An accompanying <code>cpucycles-info</code> program
+prints a summary of cycle-counter accuracy.</p>
+<p>For comparison, there is a simple-sounding <code>__rdtsc()</code> API provided by
+compilers, but this works only on Intel/AMD CPUs and is generally noisier
+than PMC. There is a <code>__builtin_readcyclecounter()</code> that works on more
+CPUs, but this works only with <code>clang</code> and has the same noise problems.
+Both of these mechanisms put the burden on the caller to figure out what
+can be done on other CPUs. Various packages include their own more
+portable abstraction layers for counting cycles (see, e.g., FFTW's
+<a href="https://github.com/FFTW/fftw3/blob/master/kernel/cycle.h"><code>cycle.h</code></a>,
+used to automatically select from among multiple implementations
+provided by FFTW), but this creates per-package effort to keep up with
+the latest cycle counters. The goal of libcpucycles is to provide
+state-of-the-art cycle counting centrally for all packages to use.</p><hr><font size=1><b>Version:</b>
+This is version 2023.01.06 of the "Intro" web page.
+</font>
+</div>
+</body>
+</html>
diff --git a/cpu-cycles/libcpucycles/doc/html/install.html b/cpu-cycles/libcpucycles/doc/html/install.html
new file mode 100644
index 0000000000..4d2899f41c
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/html/install.html
@@ -0,0 +1,101 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<style type="text/css">
+html{overflow-y:scroll}
+body{font-family:sans-serif}
+p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
+li p{font-size:1.0em}
+blockquote p{font-size:1.0em}
+tt{font-size:1.2em}
+code{font-size:1.2em}
+h1{font-size:1.5em}
+h2{font-size:1.3em}
+h3{font-size:1.0em}
+h1 a{text-decoration:none}
+table{border-collapse:collapse}
+th,td{border:1px solid black}
+table a{text-decoration:none}
+table tr{font-size:0.9em;line-height:1.6em}
+.links a:hover{text-decoration:underline}
+.links a:active{text-decoration:underline}
+.links img{width:200px;padding-left:1em}
+.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
+.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
+.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
+min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
+font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
+.here{border-bottom:0px;background-color:#ffffff}
+.away{background-color:#125d0d;}
+.away a{text-decoration:none;display:block;color:#ffffff}
+.away a:hover,.away a:active{text-decoration:underline}
+.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
+</style>
+<title>
+Install</title>
+</head>
+<body>
+<div class=headline>
+libcpucycles
+</div>
+<div class=nav>
+<div class="navt away"><a href=index.html>Intro</a>
+</div><div class="navt away"><a href=download.html>Download</a>
+</div><div class="navt here">Install
+</div><div class="navt away"><a href=api.html>API</a>
+</div><div class="navt away"><a href=counters.html>Counters</a>
+</div><div class="navt away"><a href=selection.html>Selection</a>
+</div><div class="navt away"><a href=security.html>Security</a>
+</div></div>
+<div class=main>
+<p>Prerequisites: <code>python3</code>; <code>gcc</code> and/or <code>clang</code>. Currently tested only
+under Linux, but porting to other systems shouldn't be difficult.</p>
+<p>For sysadmins, to install in <code>/usr/local/{include,lib,bin}</code>:</p>
+<pre><code>    ./configure &amp;&amp; make -j8 install
+</code></pre>
+<p>For developers with an unprivileged account (typically with</p>
+<pre><code>    export LD_LIBRARY_PATH="$HOME/lib"
+    export LIBRARY_PATH="$HOME/lib"
+    export CPATH="$HOME/include"
+    export PATH="$HOME/bin:$PATH"
+</code></pre>
+<p>in <code>$HOME/.profile</code>), to install in <code>$HOME/{include,lib,bin}</code>:</p>
+<pre><code>    ./configure --prefix=$HOME &amp;&amp; make -j8 install
+</code></pre>
+<p>For distributors creating a package: Run</p>
+<pre><code>    ./configure --prefix=/usr &amp;&amp; make -j8
+</code></pre>
+<p>and then follow your usual packaging procedures for the
+<code>build/0/package</code> files:</p>
+<pre><code>    build/0/package/man/man3/cpucycles.3
+    build/0/package/include/cpucycles.h
+    build/0/package/lib/libcpucycles*
+    build/0/package/bin/cpucycles-info
+</code></pre>
+<p>There are some old systems where libcpucycles requires <code>-lrt</code> for
+<code>clock_gettime</code>; currently <code>libcpucycles.so</code> doesn't link to <code>-lrt</code>,
+so it's up to the caller to link to <code>-lrt</code>.</p>
+<p>More options: You can run</p>
+<pre><code>    ./configure --host=amd64
+</code></pre>
+<p>to override <code>./configure</code>'s guess of the architecture that it should
+compile for. The architecture controls which cycle counters to try
+compiling: e.g., <code>amd64</code> tries compiling <code>cpucycles/amd64*</code> and
+<code>cpucycles/default*</code>.</p>
+<p>Inside the <code>build</code> directory, <code>0</code> is symlinked to <code>amd64</code> for
+<code>--host=amd64</code>. Running <code>make clean</code> removes <code>build/amd64</code>. Re-running
+<code>./configure</code> automatically starts with <code>make clean</code>.</p>
+<p>A subsequent <code>./configure --host=arm64</code> will create <code>build/arm64</code> and
+symlink <code>0 -&gt; arm64</code>, without touching an existing <code>build/amd64</code>.
+However, cross-compilers aren't yet selected automatically.</p>
+<p>Compilers tried are listed in <code>compilers/default</code>. Each compiler
+includes <code>-fPIC</code> to create a shared library, <code>-fvisibility=hidden</code> to
+hide non-public symbols in the library, and <code>-fwrapv</code> to switch to a
+slightly less dangerous version of C. The first compiler that seems to
+work is used to compile everything.</p><hr><font size=1><b>Version:</b>
+This is version 2023.01.05 of the "Install" web page.
+</font>
+</div>
+</body>
+</html>
diff --git a/cpu-cycles/libcpucycles/doc/html/security.html b/cpu-cycles/libcpucycles/doc/html/security.html
new file mode 100644
index 0000000000..e978bcddb4
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/html/security.html
@@ -0,0 +1,122 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<style type="text/css">
+html{overflow-y:scroll}
+body{font-family:sans-serif}
+p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
+li p{font-size:1.0em}
+blockquote p{font-size:1.0em}
+tt{font-size:1.2em}
+code{font-size:1.2em}
+h1{font-size:1.5em}
+h2{font-size:1.3em}
+h3{font-size:1.0em}
+h1 a{text-decoration:none}
+table{border-collapse:collapse}
+th,td{border:1px solid black}
+table a{text-decoration:none}
+table tr{font-size:0.9em;line-height:1.6em}
+.links a:hover{text-decoration:underline}
+.links a:active{text-decoration:underline}
+.links img{width:200px;padding-left:1em}
+.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
+.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
+.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
+min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
+font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
+.here{border-bottom:0px;background-color:#ffffff}
+.away{background-color:#125d0d;}
+.away a{text-decoration:none;display:block;color:#ffffff}
+.away a:hover,.away a:active{text-decoration:underline}
+.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
+</style>
+<title>
+Security</title>
+</head>
+<body>
+<div class=headline>
+libcpucycles
+</div>
+<div class=nav>
+<div class="navt away"><a href=index.html>Intro</a>
+</div><div class="navt away"><a href=download.html>Download</a>
+</div><div class="navt away"><a href=install.html>Install</a>
+</div><div class="navt away"><a href=api.html>API</a>
+</div><div class="navt away"><a href=counters.html>Counters</a>
+</div><div class="navt away"><a href=selection.html>Selection</a>
+</div><div class="navt here">Security
+</div></div>
+<div class=main>
+<p>Many security systems have been shown to be breakable by "timing
+attacks". These attacks extract secrets by analyzing timings of the
+legitimate user's operations on secret data. See the June 2022 survey
+page <a href="https://timing.attacks.cr.yp.to">https://timing.attacks.cr.yp.to</a>
+for an overview and further references.</p>
+<p>Sometimes these attacks are used as motivation to disable the attacker's
+access to various timing mechanisms. For example, Firefox rounds its
+<code>performance.now</code> timer to 1-millisecond resolution
+<a href="https://developer.mozilla.org/en-US/docs/Web/API/Performance/now">"to mitigate potential security threats"</a>.</p>
+<p>As another example, reducing <code>/proc/sys/kernel/perf_event_paranoid</code>
+under Linux to 2 (from 3 or higher), so that libcpucycles has access to
+the best available Intel/AMD cycle counter (RDPMC), also means making
+this cycle counter and other performance-monitoring counters available
+to any attacker-controlled software running on the computer. Perhaps
+this helps timing attacks, not to mention the possibility of opening up
+other vulnerabilities via the complicated <code>perf_event</code> interface.</p>
+<p>As yet another example, ARM CPUs disable user access to the main CPU
+cycle counter by default. Installing a kernel module to enable user
+access to the cycle counter could help attacks.</p>
+<p>Given the availability of simple mechanisms to disable RDPMC etc., it is
+easy to recommend using those mechanisms. To avoid creating unnecessary
+tension between those recommendations and the use of libcpucycles,
+applications that use libcpucycles should be structured so that
+high-resolution timers are used only on controlled development and
+benchmarking machines, not on general end-user machines.</p>
+<p>This structure might seem incompatible with using cycle counts to
+automatically select the best of multiple options, as in FFTW. However,
+new infrastructure introduced in <a href="https://lib25519.cr.yp.to">lib25519</a>
+automatically selects options on end-user machines based on cycle counts
+that were <em>collected on benchmarking machines</em>.</p>
+<p>The above text should not be understood as endorsing the idea that
+disabling timers is an <em>effective</em> defense against timing attacks.
+Certainly disabling high-resolution timers is not sufficient for
+security: there are many ways for attackers to amplify timing signals
+and to statistically filter out noise from low-resolution timers.
+Disabling <em>every</em> standard timing mechanism on the machine does not stop
+the attacker from accessing a remote timer or a counter maintained by
+the attacker's software. Perhaps disabling timers sometimes makes the
+difference between a feasible attack and an infeasible attack, but
+evaluating this is extremely difficult.</p>
+<p>Meanwhile there is an auditable methodology available to stop timing
+attacks: constant-time programming, which systematically cuts off data
+flow from secrets to timings.</p>
+<p>For example, secrets affect a CPU's power consumption, and Turbo Boost
+creates data flow from power consumption to timings, as illustrated by
+the <a href="https://www.hertzbleed.com">Hertzbleed attack</a> extracting secret
+keys from the SIKE cryptosystem (before SIKE was broken in other ways),
+and an <a href="https://arxiv.org/abs/2206.07012">independent attack</a>
+extracting secret AES keys. Consequently, the constant-time methodology
+does not allow Turbo Boost.</p>
+<p>This is why <a href="https://timing.attacks.cr.yp.to">https://timing.attacks.cr.yp.to</a>
+recommends turning off Turbo Boost "right now", and explains the
+mechanisms available to do this. One non-security reason that it was
+already normal (although not universal) for manufacturers to provide
+these mechanisms to end users is that Turbo Boost has a reputation for
+causing premature hardware failures. Turbo Boost also provides very
+little speed benefit for modern multithreaded vectorized applications.</p>
+<p>Another reaction to timing attacks is to apply "masking" techniques.
+These techniques <em>seem</em> to make it more difficult for attackers to
+extract secrets from power consumption and other side channels. However,
+as <a href="https://timing.attacks.cr.yp.to">https://timing.attacks.cr.yp.to</a>
+explains, it is "practically impossible for an auditor to obtain any
+real assurance that these techniques are secure". See the December 2022
+paper
+<a href="https://eprint.iacr.org/2022/1713">"Breaking a fifth-order masked implementation of CRYSTALS-Kyber by copy-paste"</a>
+for a newer example of a security failure in a masked implementation.</p><hr><font size=1><b>Version:</b>
+This is version 2023.01.05 of the "Security" web page.
+</font>
+</div>
+</body>
+</html>
diff --git a/cpu-cycles/libcpucycles/doc/html/selection.html b/cpu-cycles/libcpucycles/doc/html/selection.html
new file mode 100644
index 0000000000..eab7c4ba9a
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/html/selection.html
@@ -0,0 +1,158 @@
+<html>
+<head>
+<meta http-equiv="content-type" content="text/html; charset=utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<style type="text/css">
+html{overflow-y:scroll}
+body{font-family:sans-serif}
+p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
+li p{font-size:1.0em}
+blockquote p{font-size:1.0em}
+tt{font-size:1.2em}
+code{font-size:1.2em}
+h1{font-size:1.5em}
+h2{font-size:1.3em}
+h3{font-size:1.0em}
+h1 a{text-decoration:none}
+table{border-collapse:collapse}
+th,td{border:1px solid black}
+table a{text-decoration:none}
+table tr{font-size:0.9em;line-height:1.6em}
+.links a:hover{text-decoration:underline}
+.links a:active{text-decoration:underline}
+.links img{width:200px;padding-left:1em}
+.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
+.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
+.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
+min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
+font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
+.here{border-bottom:0px;background-color:#ffffff}
+.away{background-color:#125d0d;}
+.away a{text-decoration:none;display:block;color:#ffffff}
+.away a:hover,.away a:active{text-decoration:underline}
+.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
+</style>
+<title>
+Selection</title>
+</head>
+<body>
+<div class=headline>
+libcpucycles
+</div>
+<div class=nav>
+<div class="navt away"><a href=index.html>Intro</a>
+</div><div class="navt away"><a href=download.html>Download</a>
+</div><div class="navt away"><a href=install.html>Install</a>
+</div><div class="navt away"><a href=api.html>API</a>
+</div><div class="navt away"><a href=counters.html>Counters</a>
+</div><div class="navt here">Selection
+</div><div class="navt away"><a href=security.html>Security</a>
+</div></div>
+<div class=main>
+<p>Here is how libcpucycles decides which cycle counter to use. The
+underlying principles are as follows:</p>
+<ul>
+<li>
+<p>Failure is not allowed. Using a low-resolution timer such as
+  <code>gettimeofday()</code> to estimate cycle counts is not desirable but is better
+  than providing no information.</p>
+</li>
+<li>
+<p>A counter that does well on some CPUs and OSes can do badly on others.
+  The counter selection in libcpucycles is based not just on rules set
+  at compile time but also on measurements of how well the counters
+  perform when the program first calls <code>cpucycles()</code>.</p>
+</li>
+<li>
+<p>A critical application of cycle counting is collecting cycle counts
+  for multiple options to see which option is faster. It is the caller's
+  responsibility to compute medians of cycle counts for many runs of
+  whatever is being benchmarked: medians filter out occasional
+  cycle-count jumps caused by migration to another core (if the
+  benchmark is not pinned to a single core) or interrupts from other OS
+  activity. libcpucycles does not reject an otherwise attractive counter
+  merely because of occasional jumps.</p>
+</li>
+<li>
+<p>Cycle-counting overhead is not desirable, but does not directly affect
+  comparisons of multiple options measured using the same cycle counter,
+  so it is less important than consistent major errors such as treating
+  2^32 + x cycles as x cycles. (Performance experts seeing a function
+  that takes billions of cycles usually focus on smaller subroutines,
+  but libcpucycles should not break larger measurements.) This is why
+  libcpucycles does not provide direct access to 32-bit cycle counters:
+  it provides wrappers that combine the counters with <code>gettimeofday()</code> to
+  produce 64 bits, even though this incurs some extra overhead.</p>
+</li>
+<li>
+<p>The noise introduced by typical off-core clocks, such as multiplying a
+  24MHz clock by 86 to estimate cycles on a 2.064GHz CPU core, comes in
+  small part from low resolution but much more from changes in CPU
+  frequency: e.g., a 10000-cycle computation might be measured as 20000
+  cycles when the CPU enters a power-saving mode. When libcpucycles has
+  access to what is believed to be an on-core cycle counter, it uses
+  that even when its measurements show some noise. (Choosing an on-core
+  cycle counter does not magically eliminate the change in the relative
+  speed of the CPU and DRAM; the usual advice to warm up the CPU and set
+  constant frequencies if possible still applies.)</p>
+</li>
+</ul>
+<p>When <code>cpucycles()</code> is first called, libcpucycles tries running each
+cycle counter that has been compiled into the library. For example, for
+64-bit ARM CPUs, libcpucycles will try <code>arm64-pmc</code>, <code>arm64-vct</code>,
+<code>default-gettimeofday</code>, <code>default-mach</code>, <code>default-monotonic</code>, and
+<code>default-perfevent</code>, minus any of those that failed to compile.</p>
+<p>Cycle counters that fail at run time with SIGILL (or SIGFPE or SIGBUS or
+SIGSEGV) are eliminated from the list. For example, <code>arm64-pmc</code> will
+fail with SIGILL if the kernel does not allow user access to
+<code>PMCCNTR_EL0</code>. Beware that libcpucycles does not catch SIGILL after its
+initial tests: if the kernel initially allows user access to
+<code>PMCCNTR_EL0</code> but later turns it off, then <code>arm64-pmc</code> will crash.</p>
+<p>Independently of these counters, libcpucycles uses various OS mechanisms
+to obtain an <em>estimate</em> of the CPU frequency. This estimate is also
+available to the caller as <code>cpucycles_persecond()</code>.</p>
+<p>The methods that libcpucycles uses to ask the OS for an estimated CPU
+frequency fail on some OS-CPU combinations, in which case libcpucycles
+falls back to a <code>cpucyclespersecond</code> environment variable, or, if that
+variable does not exist, an estimate of 2399987654 cycles per second.
+(This estimate is in a realistic range of CPU speeds, and is close to
+multiples of 24MHz, 25MHz, and 19.2MHz, which are common crystal
+frequencies.) The sysadmin can create <code>/etc/cpucyclespersecond</code> to
+override all of the OS mechanisms.</p>
+<p>For counters that do not ask for scaling, the estimated CPU frequency is
+shown in <code>cpucycles-info</code> as a double-check on the counter results. For
+counters that ask for scaling, libcpucycles uses the estimated CPU
+frequency to compute the scaling, so this is not a double-check. If a
+counter asks for scaling and the estimated CPU frequency does not seem
+close to a multiple of the counter frequency (possibly with a small
+power-of-2 denominator) then libcpucycles will throw the counter away,
+except in the case of fixed-resolution OS counters such as
+<code>gettimeofday</code> and <code>CLOCK_MONOTONIC</code>.</p>
+<p>libcpucycles computes a precision estimate for each counter (times any
+applicable scaling) as follows. Call the counter 1000 times. Check that
+the counter has never decreased, and has increased at least once. (A
+counter where the decrease/increase checks fail is retried 10 times, so
+10000 calls overall, and removed if it fails all 10 times.) The
+precision estimate is then the smallest nonzero difference between
+adjacent counter results, plus a penalty explained below.</p>
+<p>The penalty is 100 cycles for off-core counters (including RDTSC) and
+<code>default-perfevent</code>, and 200 cycles for fixed-resolution OS counters.
+For example, an on-core CPU cycle counter will be selected even if it
+actually has a resolution of 8 cycles and 50 cycles of overhead.</p>
+<p>Finally, libcpucycles selects the counter where the precision estimate
+is the smallest number of cycles. Note that an inaccurate estimate of
+CPU frequency can influence the choice between a scaled counter and an
+unscaled counter.</p>
+<p>libcpucycles does <em>not</em> carry out its counter selection (typically tens
+of milliseconds, sometimes even more) as a static initializer; callers
+are presumed to not want to incur the cost of initialization unless and
+until they are actually using <code>cpucycles()</code>. A multithreaded caller thus
+has to place locks around any possibly-first call to <code>cpucycles()</code>, or
+create its own static initializer (an <code>__attribute__((constructor))</code>
+function) with an initial <code>cpucycles()</code> call so that all subsequent
+<code>cpucycles()</code> calls are thread-safe.</p><hr><font size=1><b>Version:</b>
+This is version 2023.01.05 of the "Selection" web page.
+</font>
+</div>
+</body>
+</html>
diff --git a/cpu-cycles/libcpucycles/doc/install.md b/cpu-cycles/libcpucycles/doc/install.md
new file mode 100644
index 0000000000..9642ead64b
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/install.md
@@ -0,0 +1,56 @@
+Prerequisites: `python3`; `gcc` and/or `clang`. Currently tested only
+under Linux, but porting to other systems shouldn't be difficult.
+
+For sysadmins, to install in `/usr/local/{include,lib,bin}`:
+
+        ./configure && make -j8 install
+
+For developers with an unprivileged account (typically with
+
+        export LD_LIBRARY_PATH="$HOME/lib"
+        export LIBRARY_PATH="$HOME/lib"
+        export CPATH="$HOME/include"
+        export PATH="$HOME/bin:$PATH"
+
+in `$HOME/.profile`), to install in `$HOME/{include,lib,bin}`:
+
+        ./configure --prefix=$HOME && make -j8 install
+
+For distributors creating a package: Run
+
+        ./configure --prefix=/usr && make -j8
+
+and then follow your usual packaging procedures for the
+`build/0/package` files:
+
+        build/0/package/man/man3/cpucycles.3
+        build/0/package/include/cpucycles.h
+        build/0/package/lib/libcpucycles*
+        build/0/package/bin/cpucycles-info
+
+There are some old systems where libcpucycles requires `-lrt` for
+`clock_gettime`; currently `libcpucycles.so` doesn't link to `-lrt`,
+so it's up to the caller to link to `-lrt`.
+
+More options: You can run
+
+        ./configure --host=amd64
+
+to override `./configure`'s guess of the architecture that it should
+compile for. The architecture controls which cycle counters to try
+compiling: e.g., `amd64` tries compiling `cpucycles/amd64*` and
+`cpucycles/default*`.
+
+Inside the `build` directory, `0` is symlinked to `amd64` for
+`--host=amd64`. Running `make clean` removes `build/amd64`. Re-running
+`./configure` automatically starts with `make clean`.
+
+A subsequent `./configure --host=arm64` will create `build/arm64` and
+symlink `0 -> arm64`, without touching an existing `build/amd64`.
+However, cross-compilers aren't yet selected automatically.
+
+Compilers tried are listed in `compilers/default`. Each compiler
+includes `-fPIC` to create a shared library, `-fvisibility=hidden` to
+hide non-public symbols in the library, and `-fwrapv` to switch to a
+slightly less dangerous version of C. The first compiler that seems to
+work is used to compile everything.
diff --git a/cpu-cycles/libcpucycles/doc/man/cpucycles.3 b/cpu-cycles/libcpucycles/doc/man/cpucycles.3
new file mode 100644
index 0000000000..bb7f9134fb
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/man/cpucycles.3
@@ -0,0 +1,57 @@
+.\" Automatically generated by Pandoc 2.9.2.1
+.\"
+.TH "cpucycles" "3" "" "" ""
+.hy
+.SS NAME
+.PP
+cpucycles - count CPU cycles
+.SS SYNOPSIS
+.IP
+.nf
+\f[C]
+#include <cpucycles.h>
+
+long long count = cpucycles();
+long long persecond = cpucycles_persecond();
+const char *implementation = cpucycles_implementation();
+const char *version = cpucycles_version();
+\f[R]
+.fi
+.PP
+Link with \f[C]-lcpucycles\f[R].
+Old systems may also need \f[C]-lrt\f[R].
+.SS DESCRIPTION
+.PP
+\f[C]cpucycles()\f[R] returns an estimate for the number of CPU cycles
+that have occurred since an unspecified time in the past (perhaps system
+boot, perhaps program startup).
+.PP
+Accessing true cycle counters can be difficult on some CPUs and
+operating systems.
+\f[C]cpucycles()\f[R] does its best to produce accurate results, but
+selects a low-precision counter if the only other option is failure.
+.PP
+\f[C]cpucycles_persecond()\f[R] returns an estimate for the number of
+CPU cycles per second.
+This estimate comes from \f[C]/etc/cpucyclespersecond\f[R] if that file
+exists, otherwise from various OS mechanisms, otherwise from the
+\f[C]cpucyclespersecond\f[R] environment variable if that is set,
+otherwise 2399987654.
+.PP
+\f[C]cpucycles_implementation()\f[R] returns the name of the counter in
+use: e.g., \f[C]\[dq]amd64-pmc\[dq]\f[R].
+.PP
+\f[C]cpucycles_version()\f[R] returns the \f[C]libcpucycles\f[R] version
+number as a string: e.g., \f[C]\[dq]20230115\[dq]\f[R].
+Results of \f[C]cpucycles_implementation()\f[R] should be interpreted
+relative to \f[C]cpucycles_version()\f[R].
+.PP
+\f[C]cpucycles\f[R] is actually a function pointer.
+The first call to \f[C]cpucycles()\f[R] or
+\f[C]cpucycles_persecond()\f[R] or \f[C]cpucycles_implementation()\f[R]
+selects one of the available counters and updates the
+\f[C]cpucycles\f[R] pointer accordingly.
+Subsequent calls to \f[C]cpucycles()\f[R] are thread-safe.
+.SS SEE ALSO
+.PP
+\f[B]gettimeofday\f[R](2), \f[B]clock_gettime\f[R](2)
diff --git a/cpu-cycles/libcpucycles/doc/readme.md b/cpu-cycles/libcpucycles/doc/readme.md
new file mode 100644
index 0000000000..98a42eea41
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/readme.md
@@ -0,0 +1,36 @@
+libcpucycles is a public-domain microlibrary for counting CPU cycles.
+Cycle counts are not as detailed as
+[Falk diagrams](https://gamozolabs.github.io/metrology/2019/08/19/sushi_roll.html)
+but are the most precise timers available to typical software; they are
+central tools used in understanding and improving software performance.
+
+The libcpucycles [API](api.html) is simple: include `<cpucycles.h>`, call
+`cpucycles()` to receive a `long long` whenever desired, and link with
+`-lcpucycles`.
+
+[Internally](counters.html), libcpucycles understands machine-level
+cycle counters for amd64 (both PMC and TSC), arm32, arm64 (both PMC and
+VCT), mips64, ppc32, ppc64, riscv32, riscv64, s390x, sparc64, and x86.
+libcpucycles also understands four OS-level mechanisms, which give
+varying levels of accuracy: `mach_absolute_time`, `perf_event`,
+`CLOCK_MONOTONIC`, and, as a fallback, microsecond-resolution
+`gettimeofday`.
+
+When the program first calls `cpucycles()`, libcpucycles automatically
+benchmarks the available mechanisms and [selects](selection.html) the
+mechanism that does the best job. Subsequent `cpucycles()` calls are
+thread-safe and very fast. An accompanying `cpucycles-info` program
+prints a summary of cycle-counter accuracy.
+
+For comparison, there is a simple-sounding `__rdtsc()` API provided by
+compilers, but this works only on Intel/AMD CPUs and is generally noisier
+than PMC. There is a `__builtin_readcyclecounter()` that works on more
+CPUs, but this works only with `clang` and has the same noise problems.
+Both of these mechanisms put the burden on the caller to figure out what
+can be done on other CPUs. Various packages include their own more
+portable abstraction layers for counting cycles (see, e.g., FFTW's
+[`cycle.h`](https://github.com/FFTW/fftw3/blob/master/kernel/cycle.h),
+used to automatically select from among multiple implementations
+provided by FFTW), but this creates per-package effort to keep up with
+the latest cycle counters. The goal of libcpucycles is to provide
+state-of-the-art cycle counting centrally for all packages to use.
diff --git a/cpu-cycles/libcpucycles/doc/security.md b/cpu-cycles/libcpucycles/doc/security.md
new file mode 100644
index 0000000000..554a20f0e1
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/security.md
@@ -0,0 +1,76 @@
+Many security systems have been shown to be breakable by "timing
+attacks". These attacks extract secrets by analyzing timings of the
+legitimate user's operations on secret data. See the June 2022 survey
+page [https://timing.attacks.cr.yp.to](https://timing.attacks.cr.yp.to)
+for an overview and further references.
+
+Sometimes these attacks are used as motivation to disable the attacker's
+access to various timing mechanisms. For example, Firefox rounds its
+`performance.now` timer to 1-millisecond resolution
+["to mitigate potential security threats"](https://developer.mozilla.org/en-US/docs/Web/API/Performance/now).
+
+As another example, reducing `/proc/sys/kernel/perf_event_paranoid`
+under Linux to 2 (from 3 or higher), so that libcpucycles has access to
+the best available Intel/AMD cycle counter (RDPMC), also means making
+this cycle counter and other performance-monitoring counters available
+to any attacker-controlled software running on the computer. Perhaps
+this helps timing attacks, not to mention the possibility of opening up
+other vulnerabilities via the complicated `perf_event` interface.
+
+As yet another example, ARM CPUs disable user access to the main CPU
+cycle counter by default. Installing a kernel module to enable user
+access to the cycle counter could help attacks.
+
+Given the availability of simple mechanisms to disable RDPMC etc., it is
+easy to recommend using those mechanisms. To avoid creating unnecessary
+tension between those recommendations and the use of libcpucycles,
+applications that use libcpucycles should be structured so that
+high-resolution timers are used only on controlled development and
+benchmarking machines, not on general end-user machines.
+
+This structure might seem incompatible with using cycle counts to
+automatically select the best of multiple options, as in FFTW. However,
+new infrastructure introduced in [lib25519](https://lib25519.cr.yp.to)
+automatically selects options on end-user machines based on cycle counts
+that were _collected on benchmarking machines_.
+
+The above text should not be understood as endorsing the idea that
+disabling timers is an _effective_ defense against timing attacks.
+Certainly disabling high-resolution timers is not sufficient for
+security: there are many ways for attackers to amplify timing signals
+and to statistically filter out noise from low-resolution timers.
+Disabling _every_ standard timing mechanism on the machine does not stop
+the attacker from accessing a remote timer or a counter maintained by
+the attacker's software. Perhaps disabling timers sometimes makes the
+difference between a feasible attack and an infeasible attack, but
+evaluating this is extremely difficult.
+
+Meanwhile there is an auditable methodology available to stop timing
+attacks: constant-time programming, which systematically cuts off data
+flow from secrets to timings.
+
+For example, secrets affect a CPU's power consumption, and Turbo Boost
+creates data flow from power consumption to timings, as illustrated by
+the [Hertzbleed attack](https://www.hertzbleed.com) extracting secret
+keys from the SIKE cryptosystem (before SIKE was broken in other ways),
+and an [independent attack](https://arxiv.org/abs/2206.07012)
+extracting secret AES keys. Consequently, the constant-time methodology
+does not allow Turbo Boost.
+
+This is why [https://timing.attacks.cr.yp.to](https://timing.attacks.cr.yp.to)
+recommends turning off Turbo Boost "right now", and explains the
+mechanisms available to do this. One non-security reason that it was
+already normal (although not universal) for manufacturers to provide
+these mechanisms to end users is that Turbo Boost has a reputation for
+causing premature hardware failures. Turbo Boost also provides very
+little speed benefit for modern multithreaded vectorized applications.
+
+Another reaction to timing attacks is to apply "masking" techniques.
+These techniques _seem_ to make it more difficult for attackers to
+extract secrets from power consumption and other side channels. However,
+as [https://timing.attacks.cr.yp.to](https://timing.attacks.cr.yp.to)
+explains, it is "practically impossible for an auditor to obtain any
+real assurance that these techniques are secure". See the December 2022
+paper
+["Breaking a fifth-order masked implementation of CRYSTALS-Kyber by copy-paste"](https://eprint.iacr.org/2022/1713)
+for a newer example of a security failure in a masked implementation.
diff --git a/cpu-cycles/libcpucycles/doc/selection.md b/cpu-cycles/libcpucycles/doc/selection.md
new file mode 100644
index 0000000000..847f7820dc
--- /dev/null
+++ b/cpu-cycles/libcpucycles/doc/selection.md
@@ -0,0 +1,104 @@
+Here is how libcpucycles decides which cycle counter to use. The
+underlying principles are as follows:
+
+* Failure is not allowed. Using a low-resolution timer such as
+  `gettimeofday()` to estimate cycle counts is not desirable but is better
+  than providing no information.
+
+* A counter that does well on some CPUs and OSes can do badly on others.
+  The counter selection in libcpucycles is based not just on rules set
+  at compile time but also on measurements of how well the counters
+  perform when the program first calls `cpucycles()`.
+
+* A critical application of cycle counting is collecting cycle counts
+  for multiple options to see which option is faster. It is the caller's
+  responsibility to compute medians of cycle counts for many runs of
+  whatever is being benchmarked: medians filter out occasional
+  cycle-count jumps caused by migration to another core (if the
+  benchmark is not pinned to a single core) or interrupts from other OS
+  activity. libcpucycles does not reject an otherwise attractive counter
+  merely because of occasional jumps.
+
+* Cycle-counting overhead is not desirable, but does not directly affect
+  comparisons of multiple options measured using the same cycle counter,
+  so it is less important than consistent major errors such as treating
+  2^32 + x cycles as x cycles. (Performance experts seeing a function
+  that takes billions of cycles usually focus on smaller subroutines,
+  but libcpucycles should not break larger measurements.) This is why
+  libcpucycles does not provide direct access to 32-bit cycle counters:
+  it provides wrappers that combine the counters with `gettimeofday()` to
+  produce 64 bits, even though this incurs some extra overhead.
+
+* The noise introduced by typical off-core clocks, such as multiplying a
+  24MHz clock by 86 to estimate cycles on a 2.064GHz CPU core, comes in
+  small part from low resolution but much more from changes in CPU
+  frequency: e.g., a 10000-cycle computation might be measured as 20000
+  cycles when the CPU enters a power-saving mode. When libcpucycles has
+  access to what is believed to be an on-core cycle counter, it uses
+  that even when its measurements show some noise. (Choosing an on-core
+  cycle counter does not magically eliminate the change in the relative
+  speed of the CPU and DRAM; the usual advice to warm up the CPU and set
+  constant frequencies if possible still applies.)
+
+When `cpucycles()` is first called, libcpucycles tries running each
+cycle counter that has been compiled into the library. For example, for
+64-bit ARM CPUs, libcpucycles will try `arm64-pmc`, `arm64-vct`,
+`default-gettimeofday`, `default-mach`, `default-monotonic`, and
+`default-perfevent`, minus any of those that failed to compile.
+
+Cycle counters that fail at run time with SIGILL (or SIGFPE or SIGBUS or
+SIGSEGV) are eliminated from the list. For example, `arm64-pmc` will
+fail with SIGILL if the kernel does not allow user access to
+`PMCCNTR_EL0`. Beware that libcpucycles does not catch SIGILL after its
+initial tests: if the kernel initially allows user access to
+`PMCCNTR_EL0` but later turns it off then `arm64-pmc` will crash.
+
+Independently of these counters, libcpucycles uses various OS mechanisms
+to obtain an _estimate_ of the CPU frequency. This estimate is also
+available to the caller as `cpucycles_persecond()`.
+
+The methods that libcpucycles uses to ask the OS for an estimated CPU
+frequency fail on some OS-CPU combinations, in which case libcpucycles
+falls back to a `cpucyclespersecond` environment variable, or, if that
+variable does not exist, an estimate of 2399987654 cycles per second.
+(This estimate is in a realistic range of CPU speeds, and is close to
+multiples of 24MHz, 25MHz, and 19.2MHz, which are common crystal
+frequencies.) The sysadmin can create `/etc/cpucyclespersecond` to
+override all of the OS mechanisms.
+
+For counters that do not ask for scaling, the estimated CPU frequency is
+shown in `cpucycles-info` as a double-check on the counter results. For
+counters that ask for scaling, libcpucycles uses the estimated CPU
+frequency to compute the scaling, so this is not a double-check. If a
+counter asks for scaling and the estimated CPU frequency does not seem
+close to a multiple of the counter frequency (possibly with a small
+power-of-2 denominator) then libcpucycles will throw the counter away,
+except in the case of fixed-resolution OS counters such as
+`gettimeofday` and `CLOCK_MONOTONIC`.
+
+libcpucycles computes a precision estimate for each counter (times any
+applicable scaling) as follows. Call the counter 1000 times. Check that
+the counter has never decreased, and has increased at least once. (A
+counter where the decrease/increase checks fail is retried 10 times, so
+10000 calls overall, and removed if it fails all 10 times.) The
+precision estimate is then the smallest nonzero difference between
+adjacent counter results, plus a penalty explained below.
+
+The penalty is 100 cycles for off-core counters (including RDTSC) and
+`default-perfevent`, and 200 cycles for fixed-resolution OS counters.
+For example, an on-core CPU cycle counter will be selected even if it
+actually has, e.g., a resolution of 8 cycles and 50 cycles of overhead.
+
+Finally, libcpucycles selects the counter where the precision estimate
+is the smallest number of cycles. Note that an inaccurate estimate of
+CPU frequency can influence the choice between a scaled counter and an
+unscaled counter.
+
+libcpucycles does _not_ carry out its counter selection (typically tens
+of milliseconds, sometimes even more) as a static initializer; callers
+are presumed to not want to incur the cost of initialization unless and
+until they are actually using `cpucycles()`. A multithreaded caller thus
+has to place locks around any possibly-first call to `cpucycles()`, or
+create its own static initializer (an `__attribute__((constructor))`
+function) with an initial `cpucycles()` call so that all subsequent
+`cpucycles()` calls are thread-safe.
diff --git a/cpu-cycles/libcpucycles/scripts-build/install b/cpu-cycles/libcpucycles/scripts-build/install
new file mode 100755
index 0000000000..7ea5c77f67
--- /dev/null
+++ b/cpu-cycles/libcpucycles/scripts-build/install
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+# Install the files from package/ into the prefix given as the first argument.
+import os
+import sys
+import shutil
+import tempfile
+
+prefix = sys.argv[1]
+dirs = 'man/man3','lib','include','bin'  # subdirectories of package/ to install
+install = {}  # destination directory for each entry of dirs
+# Create the destination directories under a 022 umask.
+os.umask(0o22)
+
+for target in dirs:
+  install[target] = '%s/%s'%(prefix,target)
+  os.makedirs(install[target],exist_ok=True)
+# Switch to a 077 umask before creating the temporary directories below.
+os.umask(0o77)
+# Copy each file into a temporary directory inside its destination, then rename it into place.
+for target in dirs:
+  with tempfile.TemporaryDirectory(dir=install[target]) as t:
+    for fn in sorted(os.listdir('package/'+target)):
+      try:  # first attempt: copy a symlink as a symlink rather than copying its target
+        shutil.copy2('package/%s/%s' % (target,fn),'%s/%s' % (t,fn),follow_symlinks=False)
+      except TypeError: # XXX: old python3; should copy symlinks manually
+        shutil.copy2('package/%s/%s' % (target,fn),'%s/%s' % (t,fn))
+      os.rename('%s/%s' % (t,fn),'%s/%s' % (install[target],fn))
diff --git a/cpu-cycles/libcpucycles/scripts-build/staticlib b/cpu-cycles/libcpucycles/scripts-build/staticlib
new file mode 100755
index 0000000000..bb23658fd5
--- /dev/null
+++ b/cpu-cycles/libcpucycles/scripts-build/staticlib
@@ -0,0 +1,6 @@
+#!/bin/sh
+# Build the static archive package/lib/libcpucycles.a from the files given as arguments.
+rm -f package/lib/libcpucycles.a
+ar cr package/lib/libcpucycles.a "$@"
+ranlib package/lib/libcpucycles.a || :  # a failing ranlib is ignored
+chmod 644 package/lib/libcpucycles.a
diff --git a/cpu-cycles/libcpucycles/version b/cpu-cycles/libcpucycles/version
new file mode 100644
index 0000000000..dbdecdf7fc
--- /dev/null
+++ b/cpu-cycles/libcpucycles/version
@@ -0,0 +1 @@
+20230115
diff --git a/cpu-cycles/src/bindings.rs b/cpu-cycles/src/bindings.rs
new file mode 100644
index 0000000000..4065d3fd18
--- /dev/null
+++ b/cpu-cycles/src/bindings.rs
@@ -0,0 +1,9 @@
+#[link(name = "cpucycles", kind = "static")] // statically link libcpucycles
+extern "C" {
+    pub static mut cpucycles:
+        ::std::option::Option<unsafe extern "C" fn() -> ::std::os::raw::c_longlong>; // nullable C function pointer
+    pub fn cpucycles_implementation() -> *const ::std::os::raw::c_char; // C string naming the counter in use
+    pub fn cpucycles_version() -> *const ::std::os::raw::c_char; // C string holding the library version
+    pub fn cpucycles_persecond() -> ::std::os::raw::c_longlong; // estimated CPU cycles per second
+    pub fn cpucycles_tracesetup(); // NOTE(review): not described in the vendored docs - confirm purpose
+}
diff --git a/cpu-cycles/src/lib.rs b/cpu-cycles/src/lib.rs
new file mode 100644
index 0000000000..660545e602
--- /dev/null
+++ b/cpu-cycles/src/lib.rs
@@ -0,0 +1,163 @@
+//! Safe wrappers around the statically linked libcpucycles C library.
+//!
+//! libcpucycles selects its cycle counter lazily: the first call to
+//! `cpucycles()`, `cpucycles_persecond()` or `cpucycles_implementation()`
+//! benchmarks the available counters and rewrites the global `cpucycles`
+//! function pointer. Only calls made after that selection are thread-safe, so
+//! the wrappers below run the selection once behind a [`std::sync::Once`].
+
+// The raw bindings keep the C identifiers, e.g. the lower-case `cpucycles` static.
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+
+mod bindings;
+use bindings as c;
+
+use std::fmt;
+use std::sync::Once;
+use std::{
+    error::Error,
+    ffi::{CStr, CString, IntoStringError},
+};
+
+/// Error returned when the cycle counter cannot be read.
+#[derive(Debug)]
+pub struct CpuCyclesError {
+    message: String,
+}
+
+impl fmt::Display for CpuCyclesError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}", self.message)
+    }
+}
+
+impl Error for CpuCyclesError {}
+
+/// Guards the one-time counter selection performed inside libcpucycles.
+static COUNTER_SELECTION: Once = Once::new();
+
+/// Makes libcpucycles select its counter exactly once.
+///
+/// Concurrent callers block until the selection has finished, so none of them
+/// can observe the `cpucycles` function pointer while it is being rewritten.
+fn ensure_counter_selected() {
+    COUNTER_SELECTION.call_once(|| {
+        // SAFETY: argument-less FFI call; `call_once` runs this closure on a
+        // single thread. The returned pointer is deliberately ignored: the call
+        // is made only because it triggers the counter selection.
+        let _ = unsafe { c::cpucycles_implementation() };
+    });
+}
+
+/// Calls libcpucycles' `cpucycles_tracesetup()`.
+///
+/// NOTE(review): the vendored documentation does not describe this function;
+/// presumably it traces the counter selection. It is not synchronised with the
+/// other wrappers, so call it before any of them and from a single thread -
+/// confirm against the libcpucycles sources.
+pub fn cpucycles_tracesetup() {
+    // SAFETY: argument-less FFI call into the statically linked library.
+    unsafe { c::cpucycles_tracesetup() }
+}
+
+/// Returns libcpucycles' estimate of the number of CPU cycles elapsed since an
+/// unspecified point in the past.
+///
+/// # Errors
+///
+/// Returns [`CpuCyclesError`] if the C-side `cpucycles` function pointer is null.
+pub fn cpucycles() -> Result<i64, CpuCyclesError> {
+    ensure_counter_selected();
+    // SAFETY: the selection above has completed, so the pointer is no longer
+    // being written; it is copied out by value, never borrowed.
+    let counter = unsafe { c::cpucycles };
+    match counter {
+        // SAFETY: a non-null `cpucycles` pointer refers to the counter function
+        // chosen by libcpucycles, which takes no arguments.
+        Some(read_counter) => Ok(unsafe { read_counter() }),
+        None => Err(CpuCyclesError {
+            message: "Could not execute cpucycles!".to_string(),
+        }),
+    }
+}
+
+/// Returns libcpucycles' estimate of the number of CPU cycles per second.
+///
+/// # Errors
+///
+/// Currently never fails; the `Result` is kept for API compatibility.
+pub fn cpucycles_persecond() -> Result<i64, CpuCyclesError> {
+    ensure_counter_selected();
+    // SAFETY: argument-less FFI call, made after the one-time selection.
+    Ok(unsafe { c::cpucycles_persecond() })
+}
+
+/// Returns the name of the counter in use, e.g. `"amd64-pmc"`.
+///
+/// # Errors
+///
+/// Returns [`IntoStringError`] if the name is not valid UTF-8.
+///
+/// # Panics
+///
+/// Panics if libcpucycles returns a null pointer.
+pub fn cpucycles_implementation() -> Result<String, IntoStringError> {
+    ensure_counter_selected();
+    // SAFETY: argument-less FFI call, made after the one-time selection.
+    let name = unsafe { c::cpucycles_implementation() };
+    assert!(!name.is_null(), "cpucycles_implementation returned null");
+    // SAFETY: `name` is non-null and is expected to point at a NUL-terminated
+    // string owned by libcpucycles; it is copied before returning.
+    let name = unsafe { CStr::from_ptr(name) };
+    CString::from(name).into_string()
+}
+
+/// Returns the libcpucycles version as a string, e.g. `"20230115"`.
+///
+/// # Errors
+///
+/// Returns [`IntoStringError`] if the version is not valid UTF-8.
+///
+/// # Panics
+///
+/// Panics if libcpucycles returns a null pointer.
+pub fn cpucycles_version() -> Result<String, IntoStringError> {
+    // SAFETY: argument-less FFI call into the statically linked library.
+    let version = unsafe { c::cpucycles_version() };
+    assert!(!version.is_null(), "cpucycles_version returned null");
+    // SAFETY: `version` is non-null and is expected to point at a
+    // NUL-terminated string owned by libcpucycles; it is copied before returning.
+    let version = unsafe { CStr::from_ptr(version) };
+    CString::from(version).into_string()
+}
+
+#[cfg(test)]
+mod test {
+    use crate::*;
+
+    #[test]
+    fn cpucycles_test() {
+        let count = cpucycles();
+        assert!(count.is_ok())
+    }
+
+    #[test]
+    fn cpucycles_persecond_test() {
+        let per_second = cpucycles_persecond();
+        assert!(per_second.is_ok());
+    }
+
+    #[test]
+    fn cpucycles_implementation_test() {
+        let implementation = cpucycles_implementation();
+        assert!(implementation.is_ok());
+    }
+
+    #[test]
+    fn cpucycles_version_test() {
+        let version = cpucycles_version();
+        assert!(version.is_ok());
+    }
+}
diff --git a/mixnode/Cargo.toml b/mixnode/Cargo.toml
index 1be77655b9..3a1a21b9af 100644
--- a/mixnode/Cargo.toml
+++ b/mixnode/Cargo.toml
@@ -29,32 +29,38 @@ log = { workspace = true }
 pretty_env_logger = "0.4.0"
 rand = "0.7.3"
 rocket = { version = "0.5.0-rc.2", features = ["json"] }
-serde = { version="1.0", features = ["derive"] }
+serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
 sysinfo = "0.27.7"
-tokio = { version="1.21.2", features = ["rt-multi-thread", "net", "signal"] }
-tokio-util = { version="0.7.3", features = ["codec"] }
+tokio = { version = "1.21.2", features = ["rt-multi-thread", "net", "signal"] }
+tokio-util = { version = "0.7.3", features = ["codec"] }
 toml = "0.5.8"
 url = { version = "2.2", features = ["serde"] }
 atty = "0.2"
 
 ## internal
-nym-config = { path="../common/config" }
-nym-crypto = { path="../common/crypto" }
+nym-config = { path = "../common/config" }
+nym-crypto = { path = "../common/crypto" }
 nym-contracts-common = { path = "../common/cosmwasm-smart-contracts/contracts-common" }
-mixnet-client = { path="../common/client-libs/mixnet-client" }
-mixnode-common = { path="../common/mixnode-common" }
-nym-nonexhaustive-delayqueue = { path="../common/nonexhaustive-delayqueue" }
-nym-sphinx = { path="../common/nymsphinx" }
+mixnet-client = { path = "../common/client-libs/mixnet-client" }
+mixnode-common = { path = "../common/mixnode-common" }
+nym-nonexhaustive-delayqueue = { path = "../common/nonexhaustive-delayqueue" }
+nym-sphinx = { path = "../common/nymsphinx" }
 nym-pemstore = { path = "../common/pemstore", version = "0.2.0" }
 nym-task = { path = "../common/task" }
 nym-types = { path = "../common/types" }
-nym-topology = { path="../common/topology" }
-validator-client = { path="../common/client-libs/validator-client" }
-nym-bin-common = { path="../common/bin-common" }
+nym-topology = { path = "../common/topology" }
+validator-client = { path = "../common/client-libs/validator-client" }
+nym-bin-common = { path = "../common/bin-common" }
+cpu-cycles = { path = "../cpu-cycles", optional = true }
 
 [dev-dependencies]
-tokio = { version="1.21.2", features = ["rt-multi-thread", "net", "signal", "test-util"] }
+tokio = { version = "1.21.2", features = [
+    "rt-multi-thread",
+    "net",
+    "signal",
+    "test-util",
+] }
 
 nym-sphinx-types = { path = "../common/nymsphinx/types" }
 nym-sphinx-params = { path = "../common/nymsphinx/params" }
diff --git a/mixnode/src/main.rs b/mixnode/src/main.rs
index 92b28cb6b8..609e63a0ad 100644
--- a/mixnode/src/main.rs
+++ b/mixnode/src/main.rs
@@ -60,6 +60,18 @@ impl Cli {
     }
 }
 
+/// Logs the current CPU cycle counter reading, or the error explaining why it
+/// could not be read.
+#[cfg(feature = "cpu-cycles")]
+pub fn cpu_cycles() {
+    // `cpucycles()` returns a `Result`, which has no `Display` impl and therefore
+    // cannot be passed to `{}` directly; log the value or the error instead.
+    match cpu_cycles::cpucycles() {
+        Ok(cycles) => info!("{cycles}"),
+        Err(err) => info!("failed to read CPU cycles: {err}"),
+    }
+}
+
 #[tokio::main]
 async fn main() {
     setup_logging();