Compare commits
6 Commits
create
...
libcpucycle
| Author | SHA1 | Date | |
|---|---|---|---|
| f838ddffe2 | |||
| 7f7c33d10b | |||
| e7fdd3d076 | |||
| 4dc89bd65f | |||
| 9aa3b9507d | |||
| 61e88f304b |
@@ -43,3 +43,4 @@ Cargo.lock
|
||||
nym-connect/Cargo.lock
|
||||
.parcel-cache
|
||||
**/.DS_Store
|
||||
cpu-cycles/libcpucycles/build
|
||||
Generated
+2
-2
@@ -2706,9 +2706,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.139"
|
||||
version = "0.2.140"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
|
||||
checksum = "99227334921fae1a979cf0bfdfcc6b3e5ce376ef57e16fb6fb3ea2ed6095f80c"
|
||||
|
||||
[[package]]
|
||||
name = "libgit2-sys"
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "cpu-cycles"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
build = "build.rs"
|
||||
links = "cpucycles"
|
||||
|
||||
[dependencies]
|
||||
libc = "0.2.140"
|
||||
|
||||
[build-dependencies]
|
||||
cfg-if = "1"
|
||||
@@ -0,0 +1,65 @@
|
||||
use std::{env, path::PathBuf, process::Command};
|
||||
|
||||
fn main() {
|
||||
let out_dir = env::var("OUT_DIR").unwrap();
|
||||
let out_path = PathBuf::from(&out_dir);
|
||||
let source_path = PathBuf::from("libcpucycles")
|
||||
.canonicalize()
|
||||
.expect("cannot canonicalize path");
|
||||
|
||||
cfg_if::cfg_if! {
|
||||
if #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "mips", target_arch = "powerpc", target_arch = "powerpc64", target_arch = "arm")))] {
|
||||
panic!("Unsupported architecture - {}!", env::var("CARGO_CFG_TARGET_ARCH").unwrap(), )
|
||||
}
|
||||
};
|
||||
|
||||
let mut compile_o_command = Command::new("./configure");
|
||||
let compile_o_command = compile_o_command
|
||||
.current_dir(&source_path)
|
||||
.arg(format!("--prefix={out_dir}"));
|
||||
|
||||
match compile_o_command.output() {
|
||||
Ok(output) => {
|
||||
if !output.status.success() {
|
||||
panic!("{:?}", unsafe {
|
||||
std::str::from_utf8_unchecked(&output.stderr)
|
||||
})
|
||||
}
|
||||
}
|
||||
Err(e) => panic!("{e}"),
|
||||
}
|
||||
|
||||
let mut compile_o_command = Command::new("make");
|
||||
let compile_o_command = compile_o_command.current_dir(&source_path).arg("install");
|
||||
|
||||
match compile_o_command.output() {
|
||||
Ok(output) => {
|
||||
if !output.status.success() {
|
||||
panic!("{:?}", unsafe {
|
||||
std::str::from_utf8_unchecked(&output.stderr)
|
||||
})
|
||||
}
|
||||
}
|
||||
Err(e) => panic!("{e}"),
|
||||
}
|
||||
|
||||
println!(
|
||||
"cargo:rustc-link-search=native={}",
|
||||
out_path.join("lib").to_str().unwrap()
|
||||
);
|
||||
println!("cargo:rustc-link-lib=static=cpucycles");
|
||||
|
||||
let mut compile_o_command = Command::new("make");
|
||||
let compile_o_command = compile_o_command.current_dir(source_path).arg("clean");
|
||||
|
||||
match compile_o_command.output() {
|
||||
Ok(output) => {
|
||||
if !output.status.success() {
|
||||
panic!("{:?}", unsafe {
|
||||
std::str::from_utf8_unchecked(&output.stderr)
|
||||
})
|
||||
}
|
||||
}
|
||||
Err(e) => panic!("{e}"),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,8 @@
|
||||
default:
|
||||
cd build && $(MAKE)
|
||||
|
||||
install:
|
||||
cd build && $(MAKE) install
|
||||
|
||||
clean:
|
||||
cd build && $(MAKE) clean
|
||||
Executable
+69
@@ -0,0 +1,69 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import datetime
|
||||
import markdown
|
||||
|
||||
def load(fn):
|
||||
with open(fn) as f:
|
||||
return f.read()
|
||||
|
||||
style = load('autogen/html-style')
|
||||
sitetitle = load('autogen/html-title')
|
||||
|
||||
files = []
|
||||
|
||||
with open('autogen/html-files') as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
line = line.split(':')
|
||||
if len(line) != 3: continue
|
||||
files += [line]
|
||||
|
||||
for md,html,pagetitle in files:
|
||||
fnmd = 'doc/%s.md' % md
|
||||
fnhtml = 'doc/html/%s.html' % html
|
||||
output = ''
|
||||
|
||||
x = load(fnmd)
|
||||
x = markdown.markdown(x,extensions=['markdown.extensions.extra','markdown.extensions.tables'])
|
||||
mtime = datetime.datetime.utcfromtimestamp(os.path.getmtime(fnmd)).strftime('%Y.%m.%d')
|
||||
|
||||
output += '<html>\n<head>\n'
|
||||
output += style
|
||||
output += '<title>\n'
|
||||
output += pagetitle
|
||||
output += '</title>\n'
|
||||
output += '</head>\n'
|
||||
output += '<body>\n'
|
||||
|
||||
output += '<div class=headline>\n'
|
||||
output += sitetitle
|
||||
output += '</div>\n'
|
||||
|
||||
output += '<div class=nav>\n'
|
||||
for submd,subhtml,subpagetitle in files:
|
||||
if subhtml == html:
|
||||
output += '<div class="navt here">'
|
||||
output += pagetitle+'\n'
|
||||
else:
|
||||
output += '<div class="navt away">'
|
||||
output += '<a href=%s.html>%s</a>\n' % (subhtml,subpagetitle)
|
||||
output += '</div>'
|
||||
output += '</div>\n'
|
||||
|
||||
output += '<div class=main>\n'
|
||||
output += x
|
||||
output += '<hr><font size=1><b>Version:</b>\n'
|
||||
output += 'This is version %s of the "%s" web page.\n' % (mtime,pagetitle)
|
||||
output += '</font>\n'
|
||||
output += '</div>\n'
|
||||
|
||||
output += '</body>\n'
|
||||
output += '</html>\n'
|
||||
|
||||
if not os.path.exists(fnhtml) or output != load(fnhtml):
|
||||
with open(fnhtml+'.new','w') as f:
|
||||
f.write(output)
|
||||
os.chmod(fnhtml+'.new',0o444)
|
||||
os.rename(fnhtml+'.new',fnhtml)
|
||||
@@ -0,0 +1,7 @@
|
||||
readme:index:Intro
|
||||
download:download:Download
|
||||
install:install:Install
|
||||
api:api:API
|
||||
counters:counters:Counters
|
||||
selection:selection:Selection
|
||||
security:security:Security
|
||||
@@ -0,0 +1,32 @@
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<style type="text/css">
|
||||
html{overflow-y:scroll}
|
||||
body{font-family:sans-serif}
|
||||
p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
|
||||
li p{font-size:1.0em}
|
||||
blockquote p{font-size:1.0em}
|
||||
tt{font-size:1.2em}
|
||||
code{font-size:1.2em}
|
||||
h1{font-size:1.5em}
|
||||
h2{font-size:1.3em}
|
||||
h3{font-size:1.0em}
|
||||
h1 a{text-decoration:none}
|
||||
table{border-collapse:collapse}
|
||||
th,td{border:1px solid black}
|
||||
table a{text-decoration:none}
|
||||
table tr{font-size:0.9em;line-height:1.6em}
|
||||
.links a:hover{text-decoration:underline}
|
||||
.links a:active{text-decoration:underline}
|
||||
.links img{width:200px;padding-left:1em}
|
||||
.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
|
||||
.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
|
||||
.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
|
||||
min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
|
||||
font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
|
||||
.here{border-bottom:0px;background-color:#ffffff}
|
||||
.away{background-color:#125d0d;}
|
||||
.away a{text-decoration:none;display:block;color:#ffffff}
|
||||
.away a:hover,.away a:active{text-decoration:underline}
|
||||
.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
|
||||
</style>
|
||||
@@ -0,0 +1 @@
|
||||
libcpucycles
|
||||
Executable
+3
@@ -0,0 +1,3 @@
|
||||
#!/bin/sh
|
||||
|
||||
pandoc --standalone --to man --metadata title=cpucycles --metadata section=3 < doc/api.md > doc/man/cpucycles.3
|
||||
@@ -0,0 +1,93 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/types.h>
|
||||
#include <cpucycles.h>
|
||||
|
||||
#define TIMINGS 63
|
||||
static long long t[TIMINGS+1];
|
||||
|
||||
static void t_print(void)
|
||||
{
|
||||
long long median = 0;
|
||||
long long i,j;
|
||||
|
||||
for (i = 0;i < TIMINGS;++i)
|
||||
t[i] = t[i+1]-t[i];
|
||||
for (j = 0;j < TIMINGS;++j) {
|
||||
long long belowj = 0;
|
||||
long long abovej = 0;
|
||||
for (i = 0;i < TIMINGS;++i) if (t[i] < t[j]) ++belowj;
|
||||
for (i = 0;i < TIMINGS;++i) if (t[i] > t[j]) ++abovej;
|
||||
if (belowj*2 < TIMINGS && abovej*2 < TIMINGS) {
|
||||
median = t[j];
|
||||
break;
|
||||
}
|
||||
}
|
||||
printf(" %lld ",median);
|
||||
for (i = 0;i < TIMINGS;++i)
|
||||
printf("%+lld",t[i]-median);
|
||||
printf("\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
static long long microseconds(void)
|
||||
{
|
||||
struct timeval t;
|
||||
long long result;
|
||||
gettimeofday(&t,(struct timezone *) 0);
|
||||
result = t.tv_sec;
|
||||
result *= 1000000;
|
||||
result += t.tv_usec;
|
||||
return result;
|
||||
}
|
||||
|
||||
static volatile int v;
|
||||
|
||||
static void measure_cpucycles(void)
|
||||
{
|
||||
long long loops,i,j;
|
||||
|
||||
printf("cpucycles persecond %lld\n",cpucycles_persecond());
|
||||
printf("cpucycles implementation %s\n",cpucycles_implementation());
|
||||
|
||||
for (i = 0;i <= TIMINGS;++i)
|
||||
t[i] = cpucycles();
|
||||
printf("cpucycles median"); t_print();
|
||||
|
||||
for (loops = 1024;loops <= 1048576;loops *= 2) {
|
||||
long long t00,t01,t10,t11;
|
||||
long long m0,m1;
|
||||
double ratiobelow,ratioabove;
|
||||
|
||||
t00 = cpucycles();
|
||||
m0 = microseconds();
|
||||
t01 = cpucycles();
|
||||
|
||||
for (j = 0;j < loops;++j) v = 0;
|
||||
|
||||
t10 = cpucycles();
|
||||
m1 = microseconds();
|
||||
t11 = cpucycles();
|
||||
|
||||
if (t01 < t00) continue;
|
||||
if (t10 < t01) continue;
|
||||
if (t11 < t10) continue;
|
||||
if (m1 <= m0+2) continue;
|
||||
|
||||
ratiobelow = floor((1000000.0*(t10-t01))/(m1+1-m0));
|
||||
ratioabove = ceil((1000000.0*(t11-t00))/(m1-m0-1));
|
||||
|
||||
printf("cpucycles observed persecond %.0lf...%.0lf with %lld loops %lld microseconds\n",ratiobelow,ratioabove,loops,m1-m0);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc,char **argv)
|
||||
{
|
||||
cpucycles_tracesetup();
|
||||
printf("cpucycles version %s\n",cpucycles_version());
|
||||
measure_cpucycles();
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,2 @@
|
||||
gcc -Wall -fPIC -fwrapv -O -fvisibility=hidden
|
||||
clang -Wall -fPIC -fwrapv -Qunused-arguments -O -fvisibility=hidden
|
||||
+309
@@ -0,0 +1,309 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import platform
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
prefix = '/usr/local'
|
||||
clean = True
|
||||
linktype = 'so'
|
||||
|
||||
def normalizehost(machine):
  '''Map a platform.machine()-style name to the architecture label used by this script.

  Characters outside [_0-9a-z] are dropped first. The first matching rule wins,
  so a 64-bit name (e.g. aarch64 -> arm64) is not subsequently rewritten by the
  shorter 32-bit prefix rule for the same family (arm -> arm32). Names that
  match no rule are returned unchanged (after character filtering).
  '''
  machine = ''.join(c for c in machine if c in '_0123456789abcdefghijklmnopqrstuvwxyz')

  # exact-name rules
  for exact,result in (
    ('x86_64','amd64'),
    ('i386','x86'),
    ('i686','x86'),
  ):
    if machine == exact: return result

  # prefix rules; within each family the 64-bit prefixes must come before the 32-bit one
  for prefixes,result in (
    (('arm64','armv8','aarch64'),'arm64'),
    (('arm',),'arm32'),
    (('riscv64',),'riscv64'),
    (('riscv',),'riscv32'),
    (('mips64',),'mips64'),
    (('mips',),'mips32'),
    (('powerpc64','ppc64'),'ppc64'),
    (('powerpc','ppc'),'ppc32'),
    (('sparc64','sparcv9','sun4u'),'sparc64'),
    (('sparc','sun'),'sparc32'),
  ):
    if machine.startswith(prefixes): return result

  return machine

host = normalizehost(platform.machine())
|
||||
|
||||
makefile = ''
|
||||
|
||||
for arg in sys.argv[1:]:
|
||||
if arg.startswith('--prefix='):
|
||||
prefix = arg[9:]
|
||||
continue
|
||||
if arg.startswith('--host='):
|
||||
host = arg[7:]
|
||||
host = host.split('-')[0]
|
||||
continue
|
||||
if arg == '--clean':
|
||||
clean = True
|
||||
continue
|
||||
if arg == '--noclean':
|
||||
clean = False
|
||||
continue
|
||||
raise ValueError('unrecognized argument %s' % arg)
|
||||
|
||||
echoargs = './configure'
|
||||
echoargs += ' --prefix=%s' % prefix
|
||||
echoargs += ' --host=%s' % host
|
||||
if clean: echoargs += ' --clean'
|
||||
if not clean: echoargs += ' --noclean'
|
||||
print(echoargs)
|
||||
|
||||
# startswith() also rejects an empty prefix (from "--prefix=") with the same
# ValueError instead of failing on prefix[0] with an IndexError
if not prefix.startswith('/'):
  raise ValueError('prefix %s is not an absolute path' % prefix)
|
||||
|
||||
rpath = None
|
||||
# XXX: rpath = '%s/lib' % prefix
|
||||
|
||||
if clean:
|
||||
shutil.rmtree('build/%s' % host,ignore_errors=True)
|
||||
|
||||
def dirlinksym(dir,source,target):
|
||||
with tempfile.TemporaryDirectory(dir=dir) as t:
|
||||
os.symlink(target,'%s/symlink' % t)
|
||||
os.rename('%s/symlink' % t,'%s/%s' % (dir,source))
|
||||
|
||||
os.makedirs('build/%s' % host,exist_ok=True)
|
||||
os.makedirs('build/%s/package/bin' % host,exist_ok=True)
|
||||
os.makedirs('build/%s/package/lib' % host,exist_ok=True)
|
||||
os.makedirs('build/%s/package/include' % host,exist_ok=True)
|
||||
|
||||
if clean:
|
||||
os.symlink('../..','build/%s/src' % host)
|
||||
|
||||
# ----- build scripts
|
||||
|
||||
os.makedirs('build/%s/scripts'%host,exist_ok=True)
|
||||
dirlinksym('build/%s/scripts'%host,'install','../src/scripts-build/install')
|
||||
|
||||
# ----- compilers
|
||||
|
||||
def compilerversion(c):
  '''Run compiler command line c with --version appended.

  c is split on whitespace, so it may carry options (e.g. "gcc -Wall -O").
  Returns the combined stdout/stderr text if the command ran and exited with
  status 0; returns None if it could not be started or exited non-zero.
  '''
  try:
    p = subprocess.Popen(c.split()+['--version'],stdout=subprocess.PIPE,stderr=subprocess.STDOUT,universal_newlines=True)
    out,err = p.communicate()
  except (OSError,ValueError,subprocess.SubprocessError):
    # command missing, not executable, or output not decodable: treat as unusable
    return None
  # explicit checks rather than assert, which is removed under "python -O"
  if err or p.returncode:
    return None
  return out
|
||||
|
||||
firstcompiler = None
|
||||
|
||||
with open('compilers/default') as f:
|
||||
for c in f.readlines():
|
||||
c = c.strip()
|
||||
cv = compilerversion(c)
|
||||
if cv == None:
|
||||
print('skipping default compiler %s' % c)
|
||||
continue
|
||||
print('using default compiler %s' % c)
|
||||
firstcompiler = c
|
||||
break
|
||||
|
||||
if firstcompiler is None:
|
||||
raise ValueError('did not find a working compiler')
|
||||
|
||||
with open('build/%s/scripts/compiledefault' % host,'w') as f:
|
||||
f.write('#!/bin/sh\n')
|
||||
f.write('\n')
|
||||
f.write('dir="$1"; shift\n')
|
||||
f.write('base="$1"; shift\n')
|
||||
f.write('ext="$1"; shift\n')
|
||||
f.write('\n')
|
||||
f.write('cd "$dir" && \\\n')
|
||||
f.write('%s \\\n' % firstcompiler)
|
||||
f.write(' "$@" \\\n')
|
||||
f.write(' -c "$base.$ext"\n')
|
||||
os.chmod('build/%s/scripts/compiledefault' % host,0o755)
|
||||
|
||||
# ----- libcpucycles
|
||||
|
||||
os.makedirs('build/%s/cpucycles' % host,exist_ok=True)
|
||||
os.makedirs('build/%s/package/man/man3' % host,exist_ok=True)
|
||||
|
||||
dirlinksym('build/%s/cpucycles'%host,'cpucycles.h','../src/cpucycles/cpucycles.h')
|
||||
dirlinksym('build/%s/cpucycles'%host,'cpucycles_internal.h','../src/cpucycles/cpucycles_internal.h')
|
||||
shutil.copy2('cpucycles/cpucycles.h','build/%s/package/include/cpucycles.h'%host)
|
||||
shutil.copy2('doc/man/cpucycles.3','build/%s/package/man/man3/cpucycles.3'%host)
|
||||
|
||||
with open('build/%s/cpucycles/compile-ticks' % host,'w') as f:
|
||||
f.write('#!/bin/sh\n')
|
||||
f.write('arch="$1"; shift\n')
|
||||
f.write('x="$1"; shift\n')
|
||||
f.write('for source in try-"$arch"-"$x".c try-default-zero.c\n')
|
||||
f.write('do\n')
|
||||
f.write(' cp "$source" "$arch"-"$x".c\n')
|
||||
f.write(' %s \\\n' % firstcompiler)
|
||||
f.write(' -Dticks=cpucycles_ticks_"$arch"_"$x" \\\n')
|
||||
f.write(' -Dticks_setup=cpucycles_ticks_"$arch"_"$x"_setup \\\n')
|
||||
f.write(' -c "$arch"-"$x".c\n')
|
||||
f.write(' case $? in\n')
|
||||
f.write(' 0) break ;;\n')
|
||||
f.write(' 111) exit 111 ;;\n')
|
||||
f.write(' *) echo "skipping option that did not compile" ;;\n')
|
||||
f.write(' esac\n')
|
||||
f.write('done\n')
|
||||
os.chmod('build/%s/cpucycles/compile-ticks' % host,0o755)
|
||||
|
||||
cpucyclesoptions = []
|
||||
cpucyclesofiles = []
|
||||
|
||||
with open('cpucycles/options') as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if line == '': continue
|
||||
if line[0] == '#': continue
|
||||
base = line.split()[0]
|
||||
if not os.path.exists('cpucycles/%s.c' % base): continue
|
||||
cpucycles = base.split('-')
|
||||
if len(cpucycles) != 2: continue
|
||||
if cpucycles[0] not in (host,'default'): continue
|
||||
cpucyclesoptions += [cpucycles]
|
||||
|
||||
cpucyclesoptions += [['default','zero']] # must be last
|
||||
|
||||
for cpucycles in cpucyclesoptions:
|
||||
base = '-'.join(cpucycles)
|
||||
cpucyclesofiles += ['cpucycles/%s.o' % base]
|
||||
dirlinksym('build/%s/cpucycles'%host,'try-%s.c'%base,'../src/cpucycles/%s.c'%base)
|
||||
M = 'cpucycles/%s.o: cpucycles/try-%s.c cpucycles/try-default-zero.c\n' % (base,base)
|
||||
M += '\tcd cpucycles && ./compile-ticks %s %s\n' % tuple(cpucycles)
|
||||
M += '\n'
|
||||
makefile = M + makefile
|
||||
|
||||
for fn in sorted(os.listdir('cpucycles')):
|
||||
if not fn.endswith('.c'): continue
|
||||
if '-' in fn: continue
|
||||
base = fn[:-2]
|
||||
cpucyclesofiles += ['cpucycles/%s.o' % base]
|
||||
dirlinksym('build/%s/cpucycles'%host,fn,'../src/cpucycles/%s'%fn)
|
||||
M = 'cpucycles/%s.o: cpucycles/%s.c\n' % (base,base)
|
||||
M += '\tscripts/compiledefault cpucycles %s c\n' % base
|
||||
M += '\n'
|
||||
makefile = M + makefile
|
||||
|
||||
with open('build/%s/cpucycles/options.inc' % host,'w') as f:
|
||||
f.write('#define NUMOPTIONS %d\n' % len(cpucyclesoptions))
|
||||
f.write('#define DEFAULTOPTION (NUMOPTIONS-1)\n')
|
||||
f.write('\n')
|
||||
for cpucycles in cpucyclesoptions:
|
||||
f.write('extern long long cpucycles_ticks_%s_%s_setup(void);\n' % (cpucycles[0],cpucycles[1]))
|
||||
f.write('extern long long cpucycles_ticks_%s_%s(void);\n' % (cpucycles[0],cpucycles[1]))
|
||||
f.write('\n')
|
||||
f.write('static struct {\n')
|
||||
f.write(' const char *implementation;\n')
|
||||
f.write(' long long (*ticks_setup)(void);\n')
|
||||
f.write(' long long (*ticks)(void);\n')
|
||||
f.write('} options[NUMOPTIONS] = {\n')
|
||||
for cpucycles in cpucyclesoptions:
|
||||
f.write('{ "%s-%s", cpucycles_ticks_%s_%s_setup, cpucycles_ticks_%s_%s },\n' % (cpucycles[0],cpucycles[1],cpucycles[0],cpucycles[1],cpucycles[0],cpucycles[1]))
|
||||
f.write('} ;\n')
|
||||
|
||||
dirlinksym('build/%s/scripts'%host,'staticlib','../src/scripts-build/staticlib')
|
||||
|
||||
M = 'package/lib/libcpucycles.a: scripts/staticlib %s\n' % ' '.join(cpucyclesofiles)
|
||||
M += '\tscripts/staticlib %s\n' % ' '.join(cpucyclesofiles)
|
||||
M += '\n'
|
||||
makefile = M + makefile
|
||||
|
||||
with open('build/%s/scripts/sharedlib' % host,'w') as f:
|
||||
f.write('#!/bin/sh\n')
|
||||
f.write('\n')
|
||||
f.write('%s -shared \\\n' % firstcompiler)
|
||||
if rpath:
|
||||
f.write(' -Wl,-rpath=%s \\\n' % rpath)
|
||||
f.write(' -Wl,-soname,libcpucycles.so.1 \\\n')
|
||||
f.write(' -o package/lib/libcpucycles.so.1 \\\n')
|
||||
f.write(' "$@"\n')
|
||||
f.write('chmod 644 package/lib/libcpucycles.so.1\n')
|
||||
os.chmod('build/%s/scripts/sharedlib' % host,0o755)
|
||||
|
||||
M = 'package/lib/libcpucycles.so.1: scripts/sharedlib %s\n' % ' '.join(cpucyclesofiles)
|
||||
M += '\tscripts/sharedlib %s\n' % ' '.join(cpucyclesofiles)
|
||||
M += '\n'
|
||||
makefile = M + makefile
|
||||
|
||||
M = 'package/lib/libcpucycles.so: package/lib/libcpucycles.so.1\n'
|
||||
M += '\trm -f package/lib/libcpucycles.so\n'
|
||||
M += '\tln -s libcpucycles.so.1 package/lib/libcpucycles.so\n'
|
||||
M += '\n'
|
||||
makefile = M + makefile
|
||||
|
||||
# ----- command
|
||||
|
||||
# exist_ok=True like the other makedirs calls in this script: with --noclean the
# directory from a previous run is still present and must not abort the script
os.makedirs('build/%s/command'%host,exist_ok=True)
|
||||
for c in sorted(os.listdir('command')):
|
||||
dirlinksym('build/%s/command'%host,c,'../src/command/%s'%c)
|
||||
dirlinksym('build/%s/command'%host,'bin','../package/bin')
|
||||
dirlinksym('build/%s/command'%host,'lib','../package/lib')
|
||||
dirlinksym('build/%s/command'%host,'include','../package/include')
|
||||
|
||||
with open('build/%s/command/link' % host,'w') as f:
|
||||
f.write('#!/bin/sh\n')
|
||||
f.write('target="$1"; shift\n')
|
||||
f.write('%s \\\n' % firstcompiler)
|
||||
f.write(' -o "$target" "$@"\n')
|
||||
os.chmod('build/%s/command/link' % host,0o755)
|
||||
|
||||
commands = []
|
||||
|
||||
for fn in sorted(os.listdir('command')):
|
||||
if not fn.endswith('.c'): continue
|
||||
|
||||
libs = ['libcpucycles']
|
||||
|
||||
base = fn[:-2]
|
||||
M = 'command/%s.o: command/%s.c\n' % (base,base)
|
||||
M += '\tscripts/compiledefault command %s c -I include\n' % base
|
||||
M += '\n'
|
||||
makefile = M + makefile
|
||||
M = 'package/bin/%s: command/%s.o%s\n' % (base,base,''.join(' package/lib/%s.%s' % (x,linktype) for x in libs))
|
||||
M += '\tcd command && ./link bin/%s %s.o%s -lm -lrt\n' % (base,base,''.join(' lib/%s.%s' % (x,linktype) for x in libs))
|
||||
M += '\n'
|
||||
makefile = M + makefile
|
||||
commands += ['package/bin/%s' % base]
|
||||
|
||||
M = 'commands: %s\n' % ' '.join(commands)
|
||||
M += '\n'
|
||||
makefile = M + makefile
|
||||
|
||||
# ----- make install
|
||||
|
||||
M = 'install: scripts/install default\n'
|
||||
M += '\tscripts/install %s\n' % prefix
|
||||
M += '\n'
|
||||
makefile = M + makefile
|
||||
|
||||
# ----- make default
|
||||
|
||||
M = 'default: package/lib/libcpucycles.a package/lib/libcpucycles.so package/lib/libcpucycles.so.1 \\\n'
|
||||
M += 'commands\n'
|
||||
M += '\n'
|
||||
makefile = M + makefile
|
||||
|
||||
with open('build/%s/Makefile' % host,'w') as f:
|
||||
f.write(makefile)
|
||||
|
||||
# ----- build/0, build/Makefile
|
||||
|
||||
dirlinksym('build','0',host)
|
||||
|
||||
with open('build/Makefile','w') as f:
|
||||
f.write('default:\n')
|
||||
f.write('\tcd %s && $(MAKE)\n' % host)
|
||||
f.write('\n')
|
||||
f.write('install:\n')
|
||||
f.write('\tcd %s && $(MAKE) install\n' % host)
|
||||
f.write('\n')
|
||||
f.write('clean:\n')
|
||||
f.write('\trm -r %s\n' % host)
|
||||
@@ -0,0 +1,53 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
// adapted from supercop/cpucycles/amd64rdpmc.c
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <linux/perf_event.h>
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
static struct perf_event_attr attr;
|
||||
static int fdperf = -1;
|
||||
static struct perf_event_mmap_page *buf = 0;
|
||||
|
||||
long long ticks(void)
|
||||
{
|
||||
long long result;
|
||||
unsigned int seq;
|
||||
long long index;
|
||||
long long offset;
|
||||
|
||||
do {
|
||||
seq = buf->lock;
|
||||
asm volatile("" ::: "memory");
|
||||
index = buf->index;
|
||||
offset = buf->offset;
|
||||
asm volatile("rdpmc;shlq $32,%%rdx;orq %%rdx,%%rax"
|
||||
: "=a"(result) : "c"(index-1) : "%rdx");
|
||||
asm volatile("" ::: "memory");
|
||||
} while (buf->lock != seq);
|
||||
|
||||
result += offset;
|
||||
result &= 0xffffffffffff;
|
||||
return result;
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (fdperf == -1) {
|
||||
attr.type = PERF_TYPE_HARDWARE;
|
||||
attr.config = PERF_COUNT_HW_CPU_CYCLES;
|
||||
attr.exclude_kernel = 1;
|
||||
fdperf = syscall(__NR_perf_event_open,&attr,0,-1,-1,0);
|
||||
if (fdperf == -1) return cpucycles_SKIP;
|
||||
buf = mmap(NULL,sysconf(_SC_PAGESIZE),PROT_READ,MAP_SHARED,fdperf,0);
|
||||
}
|
||||
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return cpucycles_CYCLECOUNTER;
|
||||
}
|
||||
@@ -0,0 +1,22 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
long long ticks(void)
|
||||
{
|
||||
return __rdtsc();
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return cpucycles_MAYBECYCLECOUNTER;
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
// adapted from supercop/cpucycles/amd64tscfreq.c
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
long long ticks(void)
|
||||
{
|
||||
unsigned long long result;
|
||||
asm volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax"
|
||||
: "=a"(result) :: "%rdx");
|
||||
return result;
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return cpucycles_MAYBECYCLECOUNTER;
|
||||
}
|
||||
@@ -0,0 +1,27 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
// adapted from supercop/cpucycles/cortex.c
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
long long ticks(void)
|
||||
{
|
||||
unsigned int result;
|
||||
asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(result));
|
||||
return (unsigned long long) result;
|
||||
}
|
||||
|
||||
static long enable(void)
|
||||
{
|
||||
asm volatile("mcr p15, 0, %0, c9, c12, 0" :: "r"(17));
|
||||
asm volatile("mcr p15, 0, %0, c9, c12, 1" :: "r"(0x8000000f));
|
||||
asm volatile("mcr p15, 0, %0, c9, c12, 3" :: "r"(0x8000000f));
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (!cpucycles_works(enable)) return cpucycles_SKIP;
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return cpucycles_EXTEND32;
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
// adapted from supercop/cpucycles/armv8.c
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
long long ticks(void)
|
||||
{
|
||||
long long result;
|
||||
asm volatile("mrs %0, PMCCNTR_EL0" : "=r" (result));
|
||||
return result;
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return cpucycles_CYCLECOUNTER;
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
// adapted from supercop/cpucycles/vct.c
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
long long ticks(void)
|
||||
{
|
||||
long long result;
|
||||
asm volatile("mrs %0, CNTVCT_EL0" : "=r" (result));
|
||||
return result;
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return cpucycles_FINDMULTIPLIER;
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
// version 20230115
|
||||
// public domain
|
||||
// djb
|
||||
|
||||
// 20230115 djb: cpucycles_version()
|
||||
// 20230114 djb: improve punctuation
|
||||
|
||||
#ifndef cpucycles_h
|
||||
#define cpucycles_h
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern long long (*cpucycles)(void) __attribute__((visibility("default")));
|
||||
extern const char *cpucycles_implementation(void) __attribute__((visibility("default")));
|
||||
extern const char *cpucycles_version(void) __attribute__((visibility("default")));
|
||||
extern long long cpucycles_persecond(void) __attribute__((visibility("default")));
|
||||
extern void cpucycles_tracesetup(void) __attribute__((visibility("default")));
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,20 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
|
||||
#ifndef cpucycles_internal_h
|
||||
#define cpucycles_internal_h
|
||||
|
||||
extern long long cpucycles_init(void);
|
||||
extern long long cpucycles_microseconds(void);
|
||||
extern int cpucycles_works(long long (*)(void));
|
||||
|
||||
// return values from ticks_setup():
|
||||
#define cpucycles_SKIP (0)
|
||||
#define cpucycles_CYCLECOUNTER (-1)
|
||||
#define cpucycles_MAYBECYCLECOUNTER (-2)
|
||||
#define cpucycles_FINDMULTIPLIER (-3)
|
||||
#define cpucycles_EXTEND32 (-32)
|
||||
// and positive values mean known ticks/second
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,15 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
return 1000000;
|
||||
}
|
||||
|
||||
long long ticks(void)
|
||||
{
|
||||
return cpucycles_microseconds();
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
|
||||
#include <mach/mach_time.h>
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
long long ticks(void)
|
||||
{
|
||||
return mach_absolute_time();
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return cpucycles_FINDMULTIPLIER;
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
// adapted from supercop/cpucycles/monotonic.c
|
||||
|
||||
#include <time.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
return 1000000000;
|
||||
}
|
||||
|
||||
long long ticks(void)
|
||||
{
|
||||
struct timespec t;
|
||||
long long result;
|
||||
clock_gettime(CLOCK_MONOTONIC,&t);
|
||||
result = t.tv_sec;
|
||||
result *= 1000000000;
|
||||
result += t.tv_nsec;
|
||||
return result;
|
||||
}
|
||||
@@ -0,0 +1,101 @@
|
||||
// version 20230106
|
||||
// public domain
|
||||
// djb
|
||||
// adapted from supercop/cpucycles/perfevent.c
|
||||
|
||||
// 20230106 djb: read() into int64_t instead of long long
|
||||
// 20230106 djb: add comment on RUNNING/ENABLED
|
||||
|
||||
/*
|
||||
This code intentionally avoids dividing by the
|
||||
PERF_FORMAT_TOTAL_TIME_RUNNING/ENABLED ratio.
|
||||
|
||||
The motivation for that ratio is as follows:
|
||||
|
||||
* A typical CPU has a limited number of performance-monitoring
|
||||
counters active at once. For example, there are 8 "programmable"
|
||||
counters on Intel Skylake.
|
||||
|
||||
* "perf stat" allows the user to enable more counters. The OS kernel
|
||||
periodically (e.g., every millisecond) changes the limited number of
|
||||
active hardware counters to a new subset of the enabled counters, and
|
||||
"perf stat" reports PERF_FORMAT_TOTAL_TIME_RUNNING/ENABLED for each
|
||||
counter, the fraction of time spent with that counter running.
|
||||
|
||||
For long-running programs, dividing the hardware counter by
|
||||
RUNNING/ENABLED usually produces a reasonable estimate of what the count
|
||||
would have been without competition from other counters.
|
||||
|
||||
A fixable problem with this multiplexing of counters is that the kernel
|
||||
appears to simply cycle through counters, so unlucky programs can
|
||||
trigger moiré effects. The fix is to select random subsets of counters.
|
||||
|
||||
A more fundamental problem is that cpucycles() has to be usable for
|
||||
timing short subroutines, including subroutines so short that the OS has
|
||||
no opportunity to change from one selection of counters to another. Say
|
||||
RUNNING is 0; should cpucycles() then divide by 0?
|
||||
|
||||
If a caller runs cpucycles(), X(), cpucycles(), X(), etc., and the cycle
|
||||
counter happens to be enabled for only 80% of the runs of X(), then
|
||||
simply computing the median difference of adjacent cycle counts, with no
|
||||
scaling, will filter out the zeros and correctly compute the cost of X.
|
||||
Averages won't (without scaling), but averages have other problems, such
|
||||
as being heavily influenced by interrupts. (Omitting kernel time from
|
||||
perf results does not remove the influence of interrupts on caches.)
|
||||
|
||||
Given the importance of cycle counting, it is better to have cycle
|
||||
counters always running. For example, on Skylake, Intel provides the 8
|
||||
"programmable" counters on top of a separate cycle counter ("fixed
|
||||
counter 1"), so there is no good reason for the kernel to waste a
|
||||
"programmable" counter on a cycle counter, there is no good reason to
|
||||
turn the cycle counter off, and there is no good reason for RUNNING to
|
||||
be below ENABLED for the cycle counter.
|
||||
|
||||
Of course, applications that use just one performance counter at a time
|
||||
don't have to worry about kernels getting this wrong, and don't have to
|
||||
worry about the possibility of getting noisy or invalid results on CPUs
|
||||
that have heavier constraints on the number of simultaneous counters.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <linux/perf_event.h>
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
// File descriptor of the perf_event cycle counter; stays -1 until
// ticks_setup() has opened it successfully.
static int fddev = -1;

// Read the current value of the perf_event counter behind fddev.
// Returns 0 when the read fails or delivers fewer bytes than one full
// 64-bit counter value.
long long ticks(void)
{
  int64_t result;
  ssize_t got = read(fddev,&result,sizeof result);

  // Compare as signed: read() reports errors as -1, and comparing that
  // directly against the unsigned sizeof converts it to a huge value, which
  // would let an uninitialized result through.
  if (got != (ssize_t) sizeof result) return 0;
  return result;
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (fddev == -1) {
|
||||
static struct perf_event_attr attr;
|
||||
|
||||
memset(&attr,0,sizeof attr);
|
||||
attr.type = PERF_TYPE_HARDWARE;
|
||||
attr.size = sizeof(struct perf_event_attr);
|
||||
attr.config = PERF_COUNT_HW_CPU_CYCLES;
|
||||
attr.disabled = 1;
|
||||
attr.exclude_kernel = 1;
|
||||
attr.exclude_hv = 1;
|
||||
|
||||
fddev = syscall(__NR_perf_event_open,&attr,0,-1,-1,0);
|
||||
if (fddev == -1) return cpucycles_SKIP;
|
||||
|
||||
ioctl(fddev,PERF_EVENT_IOC_RESET,0);
|
||||
ioctl(fddev,PERF_EVENT_IOC_ENABLE,0);
|
||||
}
|
||||
|
||||
return cpucycles_MAYBECYCLECOUNTER;
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
// Setup hook of the placeholder counter: unconditionally answers
// cpucycles_SKIP.
long long ticks_setup(void)
|
||||
{
|
||||
return cpucycles_SKIP;
|
||||
}
|
||||
|
||||
// Tick source of the placeholder counter: always reports 0.
long long ticks(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
// adapted from supercop/cpucycles/mips.c
|
||||
|
||||
// mips32 release 2 instruction rdhwr
|
||||
// 7c02103b: read hwr#2 (cycle count) into $2
|
||||
// 7c02183b: read hwr#3 (cycle-count multiplier) into $2
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
static unsigned int multiplier = 0;
|
||||
|
||||
// Executes the raw instruction word 0x7c02183b (described in this file's
// header comment as "read hwr#3 (cycle-count multiplier) into $2"), copies $2
// into the file-level `multiplier`, and returns that value.
// It has the same signature as a tick function so that ticks_setup() can hand
// it to cpucycles_works().
static long long multiplier_set(void)
|
||||
{
|
||||
asm volatile(".long 0x7c02183b; move %0,$2" : "=r"(multiplier) : : "$2");
|
||||
return multiplier;
|
||||
}
|
||||
|
||||
// Executes the raw instruction word 0x7c02103b (described in this file's
// header comment as "read hwr#2 (cycle count) into $2") and multiplies the
// value by `multiplier`.
// The product is computed in an unsigned int, so only its low bits survive;
// ticks_setup() reports this counter as cpucycles_EXTEND32.
long long ticks(void)
|
||||
{
|
||||
unsigned int result;
|
||||
asm volatile(".long 0x7c02103b; move %0,$2" : "=r"(result) :: "$2");
|
||||
result *= multiplier;
|
||||
return (unsigned long long) result;
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (!cpucycles_works(multiplier_set)) return cpucycles_SKIP;
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return cpucycles_EXTEND32;
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
amd64-pmc
|
||||
amd64-tsc
|
||||
amd64-tscasm
|
||||
arm32-cortex
|
||||
arm64-pmc
|
||||
arm64-vct
|
||||
mips64-cc
|
||||
ppc32-mftb
|
||||
ppc64-mftb
|
||||
riscv32-rdcycle
|
||||
riscv64-rdcycle
|
||||
s390x-stckf
|
||||
sparc64-rdtick
|
||||
x86-tsc
|
||||
x86-tscasm
|
||||
default-perfevent
|
||||
default-mach
|
||||
default-monotonic
|
||||
default-gettimeofday
|
||||
@@ -0,0 +1,30 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
// adapted from supercop/cpucycles/powerpccpuinfo.c
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
long long ticks(void)
|
||||
{
|
||||
unsigned int high, low, newhigh;
|
||||
unsigned long long result;
|
||||
|
||||
do {
|
||||
asm volatile(
|
||||
"mftbu %0; mftb %1; mftbu %2"
|
||||
: "=r" (high), "=r" (low), "=r" (newhigh)
|
||||
);
|
||||
} while (newhigh != high);
|
||||
|
||||
result = high;
|
||||
result <<= 32;
|
||||
result |= low;
|
||||
return result;
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return cpucycles_FINDMULTIPLIER;
|
||||
}
|
||||
@@ -0,0 +1,30 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
// adapted from supercop/cpucycles/powerpccpuinfo.c
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
long long ticks(void)
|
||||
{
|
||||
unsigned int high, low, newhigh;
|
||||
unsigned long long result;
|
||||
|
||||
do {
|
||||
asm volatile(
|
||||
"mftbu %0; mftb %1; mftbu %2"
|
||||
: "=r" (high), "=r" (low), "=r" (newhigh)
|
||||
);
|
||||
} while (newhigh != high);
|
||||
|
||||
result = high;
|
||||
result <<= 32;
|
||||
result |= low;
|
||||
return result;
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return cpucycles_FINDMULTIPLIER;
|
||||
}
|
||||
@@ -0,0 +1,39 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
|
||||
// adapted from supercop/cpucycles/riscv.c
|
||||
// which has code from djb and Romain Dolbeau
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
#ifndef __riscv_xlen
|
||||
#error this code is only for riscv platforms
|
||||
#endif
|
||||
|
||||
#if __riscv_xlen != 32
|
||||
#error this code is only for riscv32 platforms
|
||||
#endif
|
||||
|
||||
// Reads the riscv32 cycle counter as two 32-bit halves and joins them.
// The asm samples the high half (rdcycleh), the low half (rdcycle) and the
// high half again, and branches back to its start label while the two high
// samples differ, so the halves combined below come from one consistent pass.
long long ticks(void)
|
||||
{
|
||||
unsigned int low, high, newhigh;
|
||||
unsigned long long result;
|
||||
|
||||
asm volatile( "start%=:\n"
|
||||
"rdcycleh %0\n"
|
||||
"rdcycle %1\n"
|
||||
"rdcycleh %2\n"
|
||||
"bne %0, %2, start%=\n"
|
||||
: "=r"(high), "=r"(low), "=r"(newhigh));
|
||||
// high half goes into bits 32..63, low half into bits 0..31
result = high;
|
||||
result <<= 32;
|
||||
result |= low;
|
||||
return result;
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return cpucycles_CYCLECOUNTER;
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
|
||||
// adapted from supercop/cpucycles/riscv.c
|
||||
// which has code from djb and Romain Dolbeau
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
#ifndef __riscv_xlen
|
||||
#error this code is only for riscv platforms
|
||||
#endif
|
||||
|
||||
#if __riscv_xlen != 64
|
||||
#error this code is only for riscv64 platforms
|
||||
#endif
|
||||
|
||||
// Returns the value delivered by a single `rdcycle` instruction.
long long ticks(void)
|
||||
{
|
||||
long long result;
|
||||
asm volatile("rdcycle %0" : "=r" (result));
|
||||
return result;
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return cpucycles_CYCLECOUNTER;
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
// version 20230106
|
||||
// public domain
|
||||
// djb
|
||||
|
||||
// adapted from sparc64-rdtick.c
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
// Has `stckf` store its clock value into `result` and returns it.
// The asm receives only the address of `result`; the "memory" clobber is what
// tells the compiler that the variable is written.
long long ticks(void)
|
||||
{
|
||||
long long result;
|
||||
asm volatile("stckf 0(%0)" :: "a"(&result) : "memory","cc");
|
||||
return result;
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return 4096000000; // manual says 2^12 per microsecond
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
|
||||
// adapted from supercop/cpucycles/sparccpuinfo.c
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
#if defined(__sparcv8) || defined(__sparcv8plus)
|
||||
#error this code is only for sparc64 platforms
|
||||
#endif
|
||||
|
||||
// Returns the value of the %tick register, read with a single `rd`.
long long ticks(void)
|
||||
{
|
||||
long long result;
|
||||
asm volatile("rd %%tick,%0" : "=r" (result));
|
||||
return result;
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return cpucycles_CYCLECOUNTER;
|
||||
}
|
||||
@@ -0,0 +1,420 @@
|
||||
// version 20230115
|
||||
// public domain
|
||||
// djb
|
||||
// includes some pieces adapted from supercop
|
||||
|
||||
// 20230115 djb: cpucycles_version()
|
||||
// 20230106 djb: support "cpu MHz static" (ibm z15)
|
||||
|
||||
#include <time.h>
|
||||
#include <sys/time.h>
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <inttypes.h>
|
||||
#include <signal.h>
|
||||
#include <setjmp.h>
|
||||
#include "cpucycles.h"
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
static int tracesetup = 0;
|
||||
|
||||
// Switches on setup tracing. Once the flag is set, cpucycles_init() prints
// one "cpucycles tracesetup" line for every counter option it evaluated.
void cpucycles_tracesetup(void)
|
||||
{
|
||||
tracesetup = 1;
|
||||
}
|
||||
|
||||
static jmp_buf crash_jmp;
|
||||
|
||||
static void crash(int s)
|
||||
{
|
||||
siglongjmp(crash_jmp,0);
|
||||
}
|
||||
|
||||
int cpucycles_works(long long (*ticks)(void))
|
||||
{
|
||||
volatile int result = 0;
|
||||
struct sigaction old_sigill;
|
||||
struct sigaction old_sigfpe;
|
||||
struct sigaction old_sigbus;
|
||||
struct sigaction old_sigsegv;
|
||||
struct sigaction crash_action;
|
||||
|
||||
memset(&crash_action,0,sizeof crash_action);
|
||||
crash_action.sa_handler = crash;
|
||||
|
||||
sigaction(SIGILL,0,&old_sigill);
|
||||
sigaction(SIGFPE,0,&old_sigfpe);
|
||||
sigaction(SIGBUS,0,&old_sigbus);
|
||||
sigaction(SIGSEGV,0,&old_sigsegv);
|
||||
|
||||
if (!sigsetjmp(crash_jmp,1)) {
|
||||
sigaction(SIGILL,&crash_action,0);
|
||||
sigaction(SIGFPE,&crash_action,0);
|
||||
sigaction(SIGBUS,&crash_action,0);
|
||||
sigaction(SIGSEGV,&crash_action,0);
|
||||
ticks();
|
||||
result = 1;
|
||||
}
|
||||
|
||||
sigaction(SIGILL,&old_sigill,0);
|
||||
sigaction(SIGFPE,&old_sigfpe,0);
|
||||
sigaction(SIGBUS,&old_sigbus,0);
|
||||
sigaction(SIGSEGV,&old_sigsegv,0);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Parse one number from the beginning of the file at path.
// Returns 1 and stores the number in *value on success, 0 otherwise.
static int osfreq_file_number(const char *path,double *value)
{
  FILE *f = fopen(path,"r");
  int s;

  if (!f) return 0;
  s = fscanf(f,"%lf",value);
  fclose(f);
  return s > 0;
}

// Apply fmt at successive line starts of f until it yields a number.
// Returns that number, or 0 once the input is exhausted.
static double osfreq_line_number(FILE *f,const char *fmt)
{
  double value;

  for (;;) {
    int s = fscanf(f,fmt,&value);
    if (s > 0) return value;
    if (s == 0) s = fscanf(f,"%*[^\n]\n"); // no match: drop rest of line
    if (s < 0) return 0;
  }
}

// Estimate the CPU frequency in cycles per second.
// Sources are consulted in a fixed order and the first one that yields a
// value wins: /etc/cpucyclespersecond, cpufreq and clock_tick files under
// /sys, three different /proc/cpuinfo fields, the sysctl, lsattr and psrinfo
// commands, and finally the cpucyclespersecond environment variable.
// Falls back to 2399987654 when nothing answers.
static double osfreq(void)
{
  static const char *const cpuinfo_formats[] = {
    "cpu MHz : %lf",
    "clock : %lf",
    "cpu MHz static : %lf",
  };
  FILE *f;
  const char *env;
  double value;
  unsigned int k;
  int s;

  if (osfreq_file_number("/etc/cpucyclespersecond",&value))
    return value;

  // the two cpufreq files are scaled by 1000
  if (osfreq_file_number("/sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed",&value))
    return 1000.0 * value;
  if (osfreq_file_number("/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq",&value))
    return 1000.0 * value;

  if (osfreq_file_number("/sys/devices/system/cpu/cpu0/clock_tick",&value))
    return value;

  // each field gets its own pass over /proc/cpuinfo; values are in MHz
  for (k = 0;k < sizeof cpuinfo_formats / sizeof cpuinfo_formats[0];++k) {
    f = fopen("/proc/cpuinfo","r");
    if (!f) continue;
    value = osfreq_line_number(f,cpuinfo_formats[k]);
    fclose(f);
    if (value) return 1000000.0 * value;
  }

  f = popen("sysctl hw.cpufrequency 2>/dev/null","r");
  if (f) {
    s = fscanf(f,"hw.cpufrequency: %lf",&value);
    pclose(f);
    if (s > 0 && value > 0) return value;
  }

  f = popen("/usr/sbin/lsattr -E -l proc0 -a frequency 2>/dev/null","r");
  if (f) {
    s = fscanf(f,"frequency %lf",&value);
    pclose(f);
    if (s > 0) return value;
  }

  f = popen("/usr/sbin/psrinfo -v 2>/dev/null","r");
  if (f) {
    value = osfreq_line_number(f," The %*s processor operates at %lf MHz");
    pclose(f);
    if (value) return 1000000.0 * value;
  }

  env = getenv("cpucyclespersecond");
  if (env && sscanf(env,"%lf",&value) > 0)
    return value;

  return 2399987654.0;
}
|
||||
|
||||
static long long persecond = 0;
|
||||
static const char *implementation = "none";
|
||||
|
||||
long long (*cpucycles)(void) = cpucycles_init;
|
||||
|
||||
// Returns the name of the counter in use.
// cpucycles() is called first: while the pointer still refers to
// cpucycles_init, that call performs the selection which sets `implementation`.
const char *cpucycles_implementation(void)
|
||||
{
|
||||
cpucycles();
|
||||
return implementation;
|
||||
}
|
||||
|
||||
// Returns the estimated number of CPU cycles per second.
// cpucycles() is called first: while the pointer still refers to
// cpucycles_init, that call is what fills in `persecond`.
long long cpucycles_persecond(void)
|
||||
{
|
||||
cpucycles();
|
||||
return persecond;
|
||||
}
|
||||
|
||||
// Returns the library version as a constant string.
const char *cpucycles_version(void)
|
||||
{
|
||||
return "20230115";
|
||||
}
|
||||
|
||||
// ----- cycle counter scaled from ticks
|
||||
|
||||
static double cpucycles_scaled_scaling = 0;
|
||||
static long long cpucycles_scaled_offset = 0;
|
||||
static long long (*cpucycles_scaled_from)(void) = 0;
|
||||
|
||||
static long long cpucycles_scaled(void)
|
||||
{
|
||||
return (cpucycles_scaled_from()-cpucycles_scaled_offset)*cpucycles_scaled_scaling;
|
||||
}
|
||||
|
||||
// ----- cycle counter extended from 32-bit ticks
|
||||
|
||||
static long long (*cpucycles_extend32_from)(void) = 0;
|
||||
|
||||
static uint32_t cpucycles_extend32_prev_ticks;
|
||||
static long long cpucycles_extend32_prev_us;
|
||||
static long long cpucycles_extend32_prev_cycles;
|
||||
|
||||
static void cpucycles_extend32_setup(void)
|
||||
{
|
||||
long long (*ticks)(void) = cpucycles_extend32_from;
|
||||
cpucycles_extend32_prev_ticks = ticks();
|
||||
cpucycles_extend32_prev_us = cpucycles_microseconds();
|
||||
cpucycles_extend32_prev_cycles = 0;
|
||||
}
|
||||
|
||||
// Counter installed for 32-bit tick sources: extends cpucycles_extend32_from
// to a 64-bit count. It keeps a checkpoint (ticks, microseconds, accumulated
// cycles) and uses elapsed wall-clock time to add back wraparounds of the
// 32-bit counter that happened since the checkpoint.
static long long cpucycles_extend32(void)
|
||||
{
|
||||
long long (*ticks)(void) = cpucycles_extend32_from;
|
||||
|
||||
uint32_t new_ticks = ticks();
|
||||
// difference of two uint32_t values, widened afterwards; with a 32-bit
// unsigned int this is the tick advance modulo 2^32
unsigned long long delta_ticks = new_ticks-cpucycles_extend32_prev_ticks;
|
||||
long long new_us = cpucycles_microseconds();
|
||||
long long delta_us = new_us-cpucycles_extend32_prev_us;
|
||||
|
||||
// assume that number of cycles cannot increase by 2^32 in 2ms
|
||||
|
||||
// under 1000 microseconds since the checkpoint: answer relative to the
// checkpoint and leave the checkpoint where it is
if (delta_us < 1000)
|
||||
return cpucycles_extend32_prev_cycles+delta_ticks;
|
||||
|
||||
cpucycles_extend32_prev_ticks = new_ticks;
|
||||
cpucycles_extend32_prev_us = new_us;
|
||||
|
||||
if (delta_us >= 2000) {
|
||||
// cycles expected for the elapsed time according to persecond
long long target = (delta_us*0.000001)*persecond;
|
||||
// add whole wraps (2^32) while delta_ticks is more than 2^31 below target
while (delta_ticks+2147483648ULL < target)
|
||||
delta_ticks += 4294967296ULL;
|
||||
}
|
||||
|
||||
// advance the accumulated count and return the new total
return cpucycles_extend32_prev_cycles += delta_ticks;
|
||||
}
|
||||
|
||||
// ----- estimating cycles per tick
|
||||
|
||||
// Current gettimeofday() time expressed as a single count of microseconds.
long long cpucycles_microseconds(void)
{
  struct timeval now;

  gettimeofday(&now,(struct timezone *) 0);
  // widen the seconds before scaling so the product is computed in long long
  return (long long) now.tv_sec * 1000000 + now.tv_usec;
}
|
||||
|
||||
// Estimates how many CPU cycles one tick of `ticks` represents.
// Busy-waits until at least 10000 microseconds have passed AND the counter
// has advanced by at least 1000 ticks, then returns
//   (cycles expected in the elapsed time, from persecond) / (ticks elapsed).
// Returns 0 if the counter did not end up ahead of its starting value.
static double estimate_cyclespertick(long long (*ticks)(void))
|
||||
{
|
||||
long long t0,t1,us0,us1;
|
||||
|
||||
t0 = ticks();
|
||||
us0 = cpucycles_microseconds();
|
||||
do {
|
||||
t1 = ticks();
|
||||
us1 = cpucycles_microseconds();
|
||||
} while (us1-us0 < 10000 || t1-t0 < 1000);
|
||||
if (t1 <= t0) return 0;
|
||||
// from here on t1 and us1 hold elapsed amounts rather than absolute readings
t1 -= t0;
|
||||
us1 -= us0;
|
||||
return (persecond * 0.000001 * (double) us1) / (double) t1;
|
||||
}
|
||||
|
||||
// ----- selecting an option
|
||||
|
||||
#include "options.inc"
|
||||
|
||||
#define CALLS 1000
|
||||
#define ESTIMATES 3
|
||||
|
||||
// Initial target of the `cpucycles` function pointer: selects a counter.
//
// Steps:
//  1. persecond is taken from osfreq().
//  2. Every entry of options[] is asked to set itself up. The return value of
//     ticks_setup() decides how its ticks are turned into cycles (scaling)
//     and whether the counter is only 32 bits wide (only32).
//  3. Each usable option is measured: precision is the smallest positive
//     difference between consecutive (scaled) readings, plus a penalty for
//     anything that is not a known cycle counter.
//  4. The option with the smallest non-zero precision wins; DEFAULTOPTION is
//     used when none qualified.
//  5. `cpucycles` is redirected to the winner (directly, through the 32-bit
//     extension, or through the scaling wrapper) and its first reading is
//     returned.
long long cpucycles_init(void)
|
||||
{
|
||||
long long precision[NUMOPTIONS];
|
||||
double scaling[NUMOPTIONS];
|
||||
int only32[NUMOPTIONS];
|
||||
long long bestprecision;
|
||||
long long bestopt;
|
||||
long long opt;
|
||||
|
||||
persecond = osfreq();
|
||||
|
||||
for (opt = 0;opt < NUMOPTIONS;++opt) {
|
||||
long long freq = options[opt].ticks_setup();
|
||||
long long tries;
|
||||
|
||||
// precision 0 marks the option as unusable unless measured below
precision[opt] = 0;
|
||||
scaling[opt] = 0;
|
||||
only32[opt] = 0;
|
||||
|
||||
// a positive freq is used as ticks per second: cycles per tick is then
// persecond/freq; the other recognized codes are handled one by one
if (freq > 0) {
|
||||
scaling[opt] = persecond*1.0/freq;
|
||||
} else if (freq == cpucycles_CYCLECOUNTER) {
|
||||
scaling[opt] = 1.0;
|
||||
} else if (freq == cpucycles_EXTEND32) {
|
||||
only32[opt] = 1;
|
||||
scaling[opt] = 1.0;
|
||||
} else if (freq == cpucycles_MAYBECYCLECOUNTER) {
|
||||
scaling[opt] = 1.0;
|
||||
} else if (freq == cpucycles_FINDMULTIPLIER) {
|
||||
int ok = 0;
|
||||
double denom;
|
||||
long long loop;
|
||||
|
||||
// try denominators 1,2,4,...,1024: accept the first one for which
// denom * (measured cycles per tick) is within 0.1 of the same integer
// in all ESTIMATES measurements; scaling becomes that integer / denom
for (denom = 1;denom <= 1024;denom += denom) {
|
||||
double est[ESTIMATES];
|
||||
for (loop = 0;loop < ESTIMATES;++loop)
|
||||
est[loop] = denom*estimate_cyclespertick(options[opt].ticks);
|
||||
// round est[0] to the nearest integer
scaling[opt] = (double) (long long) est[0];
|
||||
if (scaling[opt] < est[0]-0.5) scaling[opt] += 1;
|
||||
if (scaling[opt] > est[0]+0.5) scaling[opt] -= 1;
|
||||
ok = 1;
|
||||
for (loop = 0;loop < ESTIMATES;++loop) {
|
||||
if (est[loop]-scaling[opt] > 0.1) ok = 0;
|
||||
if (scaling[opt]-est[loop] > 0.1) ok = 0;
|
||||
}
|
||||
if (ok) {
|
||||
scaling[opt] /= denom;
|
||||
break;
|
||||
}
|
||||
scaling[opt] = 0;
|
||||
}
|
||||
if (!ok) continue;
|
||||
} else {
|
||||
// any other return value (cpucycles_SKIP among them): option is not used
continue;
|
||||
}
|
||||
|
||||
// measure precision, allowing up to 10 attempts
for (tries = 0;tries < 10;++tries) {
|
||||
long long t[CALLS+1];
|
||||
long long ok = 1;
|
||||
long long i;
|
||||
|
||||
// take CALLS+1 back-to-back readings, scaled unless the factor is 1
if (scaling[opt] == 1.0) {
|
||||
for (i = 0;i <= CALLS;++i)
|
||||
t[i] = options[opt].ticks();
|
||||
} else {
|
||||
double scalingopt = scaling[opt];
|
||||
long long offset = options[opt].ticks();
|
||||
for (i = 0;i <= CALLS;++i)
|
||||
t[i] = (options[opt].ticks()-offset)*scalingopt;
|
||||
}
|
||||
// reject the batch if a reading went backwards or nothing advanced
for (i = 0;i < CALLS;++i)
|
||||
if (t[i] > t[i+1])
|
||||
ok = 0;
|
||||
if (t[0] == t[CALLS])
|
||||
ok = 0;
|
||||
|
||||
if (ok) {
|
||||
// smallest strictly positive step between consecutive readings
long long smallestdiff = 0;
|
||||
for (i = 0;i < CALLS;++i) {
|
||||
long long diff = t[i+1]-t[i];
|
||||
if (diff <= 0) continue;
|
||||
if (smallestdiff == 0 || diff < smallestdiff)
|
||||
smallestdiff = diff;
|
||||
}
|
||||
precision[opt] = smallestdiff;
|
||||
|
||||
// tilt selection towards more robust counters
|
||||
if (freq != cpucycles_CYCLECOUNTER && freq != cpucycles_EXTEND32)
|
||||
precision[opt] += 100;
|
||||
if (freq > 0)
|
||||
precision[opt] += 100;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
// otherwise keep trying
|
||||
// since !ok can be caused by overflow
|
||||
// or by core swap
|
||||
}
|
||||
}
|
||||
|
||||
if (tracesetup) {
|
||||
for (opt = 0;opt < NUMOPTIONS;++opt)
|
||||
printf("cpucycles tracesetup %lld %s precision %lld scaling %lf only32 %d\n"
|
||||
,opt,options[opt].implementation,precision[opt],scaling[opt],only32[opt]);
|
||||
}
|
||||
|
||||
// smallest non-zero precision wins; on ties the earlier option is kept
bestopt = DEFAULTOPTION;
|
||||
bestprecision = 0;
|
||||
for (opt = 0;opt < NUMOPTIONS;++opt)
|
||||
if (precision[opt] > 0)
|
||||
if (!bestprecision || precision[opt] < bestprecision) {
|
||||
bestopt = opt;
|
||||
bestprecision = precision[opt];
|
||||
}
|
||||
|
||||
implementation = options[bestopt].implementation;
|
||||
|
||||
// redirect the public pointer so later calls bypass this function
if (scaling[bestopt] == 1.0) {
|
||||
if (only32[bestopt]) {
|
||||
cpucycles_extend32_from = options[bestopt].ticks;
|
||||
cpucycles_extend32_setup();
|
||||
cpucycles = cpucycles_extend32;
|
||||
} else {
|
||||
cpucycles = options[bestopt].ticks;
|
||||
}
|
||||
} else {
|
||||
cpucycles_scaled_scaling = scaling[bestopt];
|
||||
cpucycles_scaled_from = options[bestopt].ticks;
|
||||
cpucycles_scaled_offset = cpucycles_scaled_from();
|
||||
cpucycles = cpucycles_scaled;
|
||||
}
|
||||
|
||||
return cpucycles();
|
||||
}
|
||||
@@ -0,0 +1,22 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
// Returns the value of the compiler's __rdtsc() intrinsic.
long long ticks(void)
|
||||
{
|
||||
return __rdtsc();
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return cpucycles_MAYBECYCLECOUNTER;
|
||||
}
|
||||
@@ -0,0 +1,22 @@
|
||||
// version 20230105
|
||||
// public domain
|
||||
// djb
|
||||
|
||||
#include "cpucycles_internal.h"
|
||||
|
||||
#ifndef __i386__
|
||||
#error this code is only for 32-bit x86 platforms
|
||||
#endif
|
||||
|
||||
// Emits the two bytes 15 and 49 (0x0F 0x31) as an instruction and returns the
// 64-bit value the asm delivers through the "=A" output operand.
// NOTE(review): 0x0F 0x31 is presumably the RDTSC encoding, matching the
// "tscasm" description in the counter documentation; confirm.
long long ticks(void)
|
||||
{
|
||||
long long result;
|
||||
asm volatile(".byte 15;.byte 49" : "=A" (result));
|
||||
return result;
|
||||
}
|
||||
|
||||
long long ticks_setup(void)
|
||||
{
|
||||
if (!cpucycles_works(ticks)) return cpucycles_SKIP;
|
||||
return cpucycles_MAYBECYCLECOUNTER;
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
### NAME
|
||||
|
||||
cpucycles - count CPU cycles
|
||||
|
||||
### SYNOPSIS
|
||||
|
||||
#include <cpucycles.h>
|
||||
|
||||
long long count = cpucycles();
|
||||
long long persecond = cpucycles_persecond();
|
||||
const char *implementation = cpucycles_implementation();
|
||||
const char *version = cpucycles_version();
|
||||
|
||||
Link with `-lcpucycles`. Old systems may also need `-lrt`.
|
||||
|
||||
### DESCRIPTION
|
||||
|
||||
`cpucycles()` returns an estimate for the number of CPU cycles that have
|
||||
occurred since an unspecified time in the past (perhaps system boot,
|
||||
perhaps program startup).
|
||||
|
||||
Accessing true cycle counters can be difficult on some CPUs and
|
||||
operating systems. `cpucycles()` does its best to produce accurate
|
||||
results, but selects a low-precision counter if the only other option is
|
||||
failure.
|
||||
|
||||
`cpucycles_persecond()` returns an estimate for the number of CPU cycles
|
||||
per second. This estimate comes from `/etc/cpucyclespersecond` if that
|
||||
file exists, otherwise from various OS mechanisms, otherwise from the
|
||||
`cpucyclespersecond` environment variable if that is set, otherwise
|
||||
2399987654.
|
||||
|
||||
`cpucycles_implementation()` returns the name of the counter in use:
|
||||
e.g., `"amd64-pmc"`.
|
||||
|
||||
`cpucycles_version()` returns the `libcpucycles` version number as a
|
||||
string: e.g., `"20230115"`. Results of `cpucycles_implementation()`
|
||||
should be interpreted relative to `cpucycles_version()`.
|
||||
|
||||
`cpucycles` is actually a function pointer. The first call to
|
||||
`cpucycles()` or `cpucycles_persecond()` or `cpucycles_implementation()`
|
||||
selects one of the available counters and updates the `cpucycles`
|
||||
pointer accordingly. Subsequent calls to `cpucycles()` are thread-safe.
|
||||
|
||||
### SEE ALSO
|
||||
|
||||
**gettimeofday**(2), **clock_gettime**(2)
|
||||
@@ -0,0 +1,447 @@
|
||||
Currently libcpucycles supports the following cycle counters. Some
|
||||
cycle counters are actually other forms of counters that libcpucycles
|
||||
scales to imitate a cycle counter. There is
|
||||
[separate documentation](selection.html)
|
||||
for how libcpucycles makes a choice of cycle counter. See also
|
||||
[security considerations](security.html) regarding enabling or disabling
|
||||
counters and regarding Turbo Boost.
|
||||
|
||||
`amd64-pmc`: Requires a 64-bit Intel/AMD platform. Requires the Linux
|
||||
perf_event interface. Accesses a cycle counter through RDPMC. Requires
|
||||
`/proc/sys/kernel/perf_event_paranoid` to be at most 2 for user-level
|
||||
RDPMC access. This counter runs at the clock frequency of the CPU core.
|
||||
|
||||
`amd64-tsc`, `amd64-tscasm`: Requires a 64-bit Intel/AMD platform.
|
||||
Requires RDTSC to be enabled, which it is by default. Uses RDTSC to
|
||||
access the CPU's time-stamp counter. On current CPUs, this is an
|
||||
off-core clock rather than a cycle counter, but it is typically a very
|
||||
fast off-core clock, making it adequate for seeing cycle counts if
|
||||
overclocking and underclocking are disabled. The difference between
|
||||
`tsc` and `tscasm` is that `tsc` uses the compiler's `__rdtsc()` while
|
||||
`tscasm` uses inline assembly.
|
||||
|
||||
`arm32-cortex`: Requires a 32-bit ARMv7-A platform. Uses
|
||||
`mrc p15, 0, %0, c9, c13, 0` to read the cycle counter. Requires user
|
||||
access to the cycle counter, which is not enabled by default but can be
|
||||
enabled under Linux via
|
||||
[a kernel module](https://github.com/thoughtpolice/enable_arm_pmu).
|
||||
This counter is natively 32 bits, but libcpucycles watches how the
|
||||
counter and `gettimeofday` increase to compute a 64-bit extension of the
|
||||
counter.
|
||||
|
||||
`arm64-pmc`: Requires a 64-bit ARMv8-A platform. Uses
|
||||
`mrs %0, PMCCNTR_EL0` to read the cycle counter. Requires user access
|
||||
to the cycle counter, which is not enabled by default but can be enabled
|
||||
under Linux via
|
||||
[a kernel module](https://github.com/rdolbeau/enable_arm_pmu).
|
||||
|
||||
`arm64-vct`: Requires a 64-bit ARMv8-A platform. Uses
|
||||
`mrs %0, CNTVCT_EL0` to read a "virtual count" timer. This is an
|
||||
off-core clock, typically running at 24MHz. Results are scaled by
|
||||
libcpucycles.
|
||||
|
||||
`mips64-cc`: Requires a 64-bit MIPS platform. (Maybe the same code would
|
||||
also work as `mips32-cc`, but this has not been tested yet.) Uses RDHWR
|
||||
to read the hardware cycle counter (hardware register 2 times a constant
|
||||
scale factor in hardware register 3). This counter is natively 32 bits,
|
||||
but libcpucycles watches how the counter and `gettimeofday` increase to
|
||||
compute a 64-bit extension of the counter.
|
||||
|
||||
`ppc32-mftb`: Requires a 32-bit PowerPC platform. Uses `mftb` and
|
||||
`mftbu` to read the "time base". This is an off-core clock, typically
|
||||
running at 24MHz.
|
||||
|
||||
`ppc64-mftb`: Requires a 64-bit PowerPC platform. Uses `mftb` and
|
||||
`mftbu` to read the "time base". This is an off-core clock, typically
|
||||
running at 24MHz.
|
||||
|
||||
`riscv32-rdcycle`: Requires a 32-bit RISC-V platform. Uses `rdcycle`
|
||||
and `rdcycleh` to read a cycle counter.
|
||||
|
||||
`riscv64-rdcycle`: Requires a 64-bit RISC-V platform. Uses `rdcycle`
|
||||
to read a cycle counter.
|
||||
|
||||
`s390x-stckf`: Requires a 64-bit z/Architecture platform. Uses `stckf`
|
||||
to read the TOD clock, which is documented to run at 4096MHz. On the
|
||||
z15, this looks like a doubling of an off-core 2048MHz clock. Results
|
||||
are scaled by libcpucycles.
|
||||
|
||||
`sparc64-rdtick`: Requires a 64-bit SPARC platform. Uses `rd %tick`
|
||||
to read a cycle counter.
|
||||
|
||||
`x86-tsc`, `x86-tscasm`: Same as `amd64-tsc` and `amd64-tscasm`, but
|
||||
for 32-bit Intel/AMD platforms instead of 64-bit Intel/AMD platforms.
|
||||
|
||||
`default-gettimeofday`: Reasonably portable. Resolution is limited to 1
|
||||
microsecond. Results are scaled by libcpucycles.
|
||||
|
||||
`default-mach`: Requires an OS with `mach_absolute_time()`. Typically
|
||||
runs at 24MHz. Results are scaled by libcpucycles.
|
||||
|
||||
`default-monotonic`: Requires `CLOCK_MONOTONIC`. Reasonably portable,
|
||||
although might fail on older systems where `default-gettimeofday` works.
|
||||
Resolution is limited to 1 nanosecond. Can be almost as good as a cycle
|
||||
counter, or orders of magnitude worse, depending on the OS and CPU.
|
||||
Results are scaled by libcpucycles.
|
||||
|
||||
`default-perfevent`: Requires the Linux `perf_event` interface, and a
|
||||
CPU where `perf_event` supports `PERF_COUNT_HW_CPU_CYCLES`. Similar
|
||||
variations in quality to `default-monotonic`, without the 1-nanosecond
|
||||
limitation.
|
||||
|
||||
`default-zero`: The horrifying last resort if nothing else works.
|
||||
|
||||
## Examples
|
||||
|
||||
These are examples of `cpucycles-info` output on various machines. The
|
||||
machines named `gcc*` are from the
|
||||
[GCC Compile Farm](https://gcc.gnu.org/wiki/CompileFarm).
|
||||
|
||||
A `median` line saying, e.g., `47 +47+28+0+2-5+0+2-5...` means that the
|
||||
differences between adjacent cycle counts were 47+47, 47+28, 47+0, 47+2,
|
||||
47−5, 47+0, 47+2, 47−5, etc., with median difference 47. The first few
|
||||
differences are typically larger because of cache effects.
|
||||
|
||||
`pi3aplus`,
|
||||
Broadcom BCM2837B0:
|
||||
```
|
||||
cpucycles version 20230105
|
||||
cpucycles tracesetup 0 arm64-pmc precision 9 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 1 arm64-vct precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 2 default-perfevent precision 189 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 4 default-monotonic precision 272 scaling 1.400000 only32 0
|
||||
cpucycles tracesetup 5 default-gettimeofday precision 1600 scaling 1400.000000 only32 0
|
||||
cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 1400000000
|
||||
cpucycles implementation arm64-pmc
|
||||
cpucycles median 10 +10+8+3+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
|
||||
cpucycles observed persecond 1032000000...4224666667 with 1024 loops 4 microseconds
|
||||
cpucycles observed persecond 1286000000...1756000000 with 2048 loops 7 microseconds
|
||||
cpucycles observed persecond 1368266666...1598000000 with 4096 loops 14 microseconds
|
||||
cpucycles observed persecond 1366700000...1473428572 with 8192 loops 29 microseconds
|
||||
cpucycles observed persecond 1366100000...1417534483 with 16384 loops 59 microseconds
|
||||
cpucycles observed persecond 1332739837...1357132232 with 32768 loops 122 microseconds
|
||||
cpucycles observed persecond 1354483471...1366945834 with 65536 loops 241 microseconds
|
||||
cpucycles observed persecond 1385684989...1392195330 with 131072 loops 472 microseconds
|
||||
cpucycles observed persecond 1347223021...1350328528 with 262144 loops 972 microseconds
|
||||
cpucycles observed persecond 1375460125...1377069853 with 524288 loops 1905 microseconds
|
||||
cpucycles observed persecond 1376527697...1377335961 with 1048576 loops 3808 microseconds
|
||||
```
|
||||
|
||||
`bblack`,
|
||||
TI Sitara XAM3359AZCZ100:
|
||||
```
|
||||
cpucycles version 20230105
|
||||
cpucycles tracesetup 0 arm32-cortex precision 8 scaling 1.000000 only32 1
|
||||
cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 3 default-monotonic precision 1283 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 4 default-gettimeofday precision 1200 scaling 1000.000000 only32 0
|
||||
cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 1000000000
|
||||
cpucycles implementation arm32-cortex
|
||||
cpucycles median 1260 +1506+62+31+7+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+13+7+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
|
||||
cpucycles observed persecond 622181818...2101888889 with 1024 loops 10 microseconds
|
||||
cpucycles observed persecond 806133333...1492615385 with 2048 loops 14 microseconds
|
||||
cpucycles observed persecond 879880000...1232565218 with 4096 loops 24 microseconds
|
||||
cpucycles observed persecond 939577777...1130581396 with 8192 loops 44 microseconds
|
||||
cpucycles observed persecond 956954022...1050047059 with 16384 loops 86 microseconds
|
||||
cpucycles observed persecond 982878542...1020685715 with 32768 loops 246 microseconds
|
||||
cpucycles observed persecond 988105105...1012217523 with 65536 loops 332 microseconds
|
||||
cpucycles observed persecond 993752077...1007159723 with 131072 loops 721 microseconds
|
||||
cpucycles observed persecond 995364296...1004009448 with 262144 loops 1377 microseconds
|
||||
cpucycles observed persecond 998216306...1001821536 with 524288 loops 2685 microseconds
|
||||
cpucycles observed persecond 998991848...1000914196 with 1048576 loops 5397 microseconds
|
||||
```
|
||||
|
||||
`hiphop`,
|
||||
Intel Xeon E3-1220 v3:
|
||||
```
|
||||
cpucycles version 20230105
|
||||
cpucycles tracesetup 0 amd64-pmc precision 40 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 1 amd64-tsc precision 124 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 amd64-tscasm precision 124 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 3 default-perfevent precision 160 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 5 default-monotonic precision 272 scaling 3.100000 only32 0
|
||||
cpucycles tracesetup 6 default-gettimeofday precision 3300 scaling 3100.000000 only32 0
|
||||
cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 3100000000
|
||||
cpucycles implementation amd64-pmc
|
||||
cpucycles median 44 +38+23+23+23-4+0-4+0-4+0-4+0+10-4-2+1-4+1-4+1+17+1-4+1-4+1-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4
|
||||
cpucycles observed persecond 2066500000...4235000000 with 8192 loops 3 microseconds
|
||||
cpucycles observed persecond 2760833333...4200250000 with 16384 loops 5 microseconds
|
||||
cpucycles observed persecond 2743416666...3313100000 with 32768 loops 11 microseconds
|
||||
cpucycles observed persecond 2986227272...3295000000 with 65536 loops 21 microseconds
|
||||
cpucycles observed persecond 3052069767...3206073171 with 131072 loops 42 microseconds
|
||||
cpucycles observed persecond 3050395348...3125523810 with 262144 loops 85 microseconds
|
||||
cpucycles observed persecond 3085123529...3123059524 with 524288 loops 169 microseconds
|
||||
cpucycles observed persecond 3084561764...3103434912 with 1048576 loops 339 microseconds
|
||||
```
|
||||
|
||||
`nucnuc`,
|
||||
Intel Pentium N3700:
|
||||
```
|
||||
cpucycles version 20230105
|
||||
cpucycles tracesetup 0 amd64-pmc precision 26 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 1 amd64-tsc precision 120 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 amd64-tscasm precision 120 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 3 default-perfevent precision 427 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 5 default-monotonic precision 320 scaling 1.600000 only32 0
|
||||
cpucycles tracesetup 6 default-gettimeofday precision 1800 scaling 1600.000000 only32 0
|
||||
cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 1600000000
|
||||
cpucycles implementation amd64-pmc
|
||||
cpucycles median 66 +12+12+14+14-1-1+0-1+0-1+0-1+0+1-1+0-1+0-1+0-2+0-1+0-1+0-1+0-2+0-1+0-1+0-1+0-2+0-1+0-1+1-1+0-2-1-1+0-1+0-1+0-2+0-1+2+0-1+0-1+0+0-1
|
||||
cpucycles observed persecond 1060500000...2325000000 with 2048 loops 3 microseconds
|
||||
cpucycles observed persecond 1387166666...2208250000 with 4096 loops 5 microseconds
|
||||
cpucycles observed persecond 1376083333...1705500000 with 8192 loops 11 microseconds
|
||||
cpucycles observed persecond 1495727272...1671800000 with 16384 loops 21 microseconds
|
||||
cpucycles observed persecond 1563428571...1655100000 with 32768 loops 41 microseconds
|
||||
cpucycles observed persecond 1580807228...1626234568 with 65536 loops 82 microseconds
|
||||
cpucycles observed persecond 1589539393...1612619632 with 131072 loops 164 microseconds
|
||||
cpucycles observed persecond 1598841463...1610230062 with 262144 loops 327 microseconds
|
||||
cpucycles observed persecond 1564336810...1569988042 with 524288 loops 670 microseconds
|
||||
cpucycles observed persecond 1599759725...1602608098 with 1048576 loops 1310 microseconds
|
||||
```
|
||||
|
||||
`saber214`,
|
||||
AMD FX-8350:
|
||||
```
|
||||
cpucycles version 20230105
|
||||
cpucycles tracesetup 0 amd64-pmc precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 1 amd64-tsc precision 167 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 amd64-tscasm precision 168 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 3 default-perfevent precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 5 default-monotonic precision 376 scaling 4.013452 only32 0
|
||||
cpucycles tracesetup 6 default-gettimeofday precision 4213 scaling 4013.452000 only32 0
|
||||
cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 4013452000
|
||||
cpucycles implementation amd64-tsc
|
||||
cpucycles median 77 +87-2+21+7+4+1+0+2-2-7-4+0+1+4-2+3+1-2-2+5-6+2+2+2+2+1-1-1+0-4+0-1-1-1-2+3-1-1+2-2+0+0+2+0+0+2-2-2+1-1-2+2-5+2+0+2+0+1+0+3-2-1-1
|
||||
cpucycles observed persecond 2767500000...5759000000 with 4096 loops 3 microseconds
|
||||
cpucycles observed persecond 3426000000...4893800000 with 8192 loops 6 microseconds
|
||||
cpucycles observed persecond 3724076923...4446363637 with 16384 loops 12 microseconds
|
||||
cpucycles observed persecond 3977833333...4363318182 with 32768 loops 23 microseconds
|
||||
cpucycles observed persecond 3984854166...4168739131 with 65536 loops 47 microseconds
|
||||
cpucycles observed persecond 3981709923...4048193799 with 131072 loops 130 microseconds
|
||||
cpucycles observed persecond 3982716417...4026914573 with 262144 loops 200 microseconds
|
||||
cpucycles observed persecond 4001637602...4025136987 with 524288 loops 366 microseconds
|
||||
cpucycles observed persecond 4007411111...4018600248 with 1048576 loops 809 microseconds
|
||||
```
|
||||
|
||||
`gcc14`,
|
||||
Intel Xeon E5-2620 v3,
|
||||
Debian testing (bookworm),
|
||||
Linux kernel 6.0.0-6-amd64:
|
||||
```
|
||||
cpucycles version 20230105
|
||||
cpucycles tracesetup 0 amd64-pmc precision 41 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 1 amd64-tsc precision 148 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 amd64-tscasm precision 148 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 3 default-perfevent precision 159 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 5 default-monotonic precision 289 scaling 3.200000 only32 0
|
||||
cpucycles tracesetup 6 default-gettimeofday precision 3400 scaling 3200.000000 only32 0
|
||||
cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 3200000000
|
||||
cpucycles implementation amd64-pmc
|
||||
cpucycles median 47 +47+28+0+2-5+0+2-5+16+2-5+0+2-5+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0
|
||||
cpucycles observed persecond 1653800000...2819333334 with 8192 loops 4 microseconds
|
||||
cpucycles observed persecond 1832111111...2389285715 with 16384 loops 8 microseconds
|
||||
cpucycles observed persecond 1936058823...2207200000 with 32768 loops 16 microseconds
|
||||
cpucycles observed persecond 2052843750...2196200000 with 65536 loops 31 microseconds
|
||||
cpucycles observed persecond 2050750000...2120048388 with 131072 loops 63 microseconds
|
||||
cpucycles observed persecond 2081896825...2117048388 with 262144 loops 125 microseconds
|
||||
cpucycles observed persecond 2089478087...2107044177 with 524288 loops 250 microseconds
|
||||
cpucycles observed persecond 2093343313...2102124249 with 1048576 loops 500 microseconds
|
||||
```
|
||||
|
||||
`gcc23`,
|
||||
Cavium Octeon II V0.1,
|
||||
Debian 8.11,
|
||||
Linux kernel 4.1.4:
|
||||
```
|
||||
cpucycles version 20230105
|
||||
cpucycles tracesetup 0 mips64-cc precision 24 scaling 1.000000 only32 1
|
||||
cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 3 default-monotonic precision 46702 scaling 2.399988 only32 0
|
||||
cpucycles tracesetup 4 default-gettimeofday precision 45799 scaling 2399.987654 only32 0
|
||||
cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 2399987654
|
||||
cpucycles implementation mips64-cc
|
||||
cpucycles median 2177 +828+17+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
|
||||
cpucycles observed persecond 641900000...1845125000 with 1024 loops 9 microseconds
|
||||
cpucycles observed persecond 745357142...1352083334 with 2048 loops 13 microseconds
|
||||
cpucycles observed persecond 809826086...1162333334 with 4096 loops 22 microseconds
|
||||
cpucycles observed persecond 897717948...1104405406 with 8192 loops 38 microseconds
|
||||
cpucycles observed persecond 957467532...1059986667 with 16384 loops 76 microseconds
|
||||
cpucycles observed persecond 973102189...1029777778 with 32768 loops 136 microseconds
|
||||
cpucycles observed persecond 986518656...1015830828 with 65536 loops 267 microseconds
|
||||
cpucycles observed persecond 993452830...1008166667 with 131072 loops 529 microseconds
|
||||
cpucycles observed persecond 996036966...1003403609 with 262144 loops 1054 microseconds
|
||||
cpucycles observed persecond 984706378...1001682630 with 524288 loops 2131 microseconds
|
||||
cpucycles observed persecond 992585292...1001178580 with 1048576 loops 4296 microseconds
|
||||
```
|
||||
|
||||
`gcc45`,
|
||||
AMD Athlon II X4 640,
|
||||
Debian 8.11,
|
||||
Linux kernel 3.16.0-11-686-pae:
|
||||
```
|
||||
cpucycles version 20230105
|
||||
cpucycles tracesetup 0 x86-tsc precision 199 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 1 x86-tscasm precision 199 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 default-perfevent precision 170 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 4 default-monotonic precision 941 scaling 3.000000 only32 0
|
||||
cpucycles tracesetup 5 default-gettimeofday precision 3200 scaling 3000.000000 only32 0
|
||||
cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 3000000000
|
||||
cpucycles implementation default-perfevent
|
||||
cpucycles median 72 +12+0+0+0+0+0+0+0+5+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0
|
||||
cpucycles observed persecond 541500000...1812000000 with 1024 loops 3 microseconds
|
||||
cpucycles observed persecond 712333333...1212250000 with 2048 loops 5 microseconds
|
||||
cpucycles observed persecond 1193285714...1733600000 with 4096 loops 6 microseconds
|
||||
cpucycles observed persecond 1689176470...1804562500 with 8192 loops 33 microseconds
|
||||
cpucycles observed persecond 1713074626...1770600000 with 16384 loops 66 microseconds
|
||||
cpucycles observed persecond 1765107692...1795140625 with 32768 loops 129 microseconds
|
||||
cpucycles observed persecond 1785369649...1800603922 with 65536 loops 256 microseconds
|
||||
cpucycles observed persecond 1781377862...1796288462 with 131072 loops 261 microseconds
|
||||
cpucycles observed persecond 1772647398...1778247827 with 262144 loops 691 microseconds
|
||||
cpucycles observed persecond 1789670493...1794149598 with 524288 loops 870 microseconds
|
||||
cpucycles observed persecond 1860276211...1861561332 with 1048576 loops 3156 microseconds
|
||||
```
|
||||
|
||||
`gcc92`,
|
||||
SiFive Freedom U740,
|
||||
Ubuntu 22.04,
|
||||
Linux kernel 5.15.0-1014-generic:
|
||||
```
|
||||
cpucycles version 20230105
|
||||
cpucycles tracesetup 0 riscv64-rdcycle precision 8 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 1 default-perfevent precision 3024 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 3 default-monotonic precision 2599 scaling 2.399988 only32 0
|
||||
cpucycles tracesetup 4 default-gettimeofday precision 2599 scaling 2399.987654 only32 0
|
||||
cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 2399987654
|
||||
cpucycles implementation riscv64-rdcycle
|
||||
cpucycles median 8 +33+27+1+1+1+1+0+0+0+22+0+0+0+0+0+0+0+628+0+0+0+7+0+0+0+145+0+0+0+0+0+0+0+22+0+0+0+0+0+0+0+158+0+0+0+0+0+0+0+22+0+0+0+0+0+0+0+22+0+0+0+0+0
|
||||
cpucycles observed persecond 530250000...1978000000 with 1024 loops 3 microseconds
|
||||
cpucycles observed persecond 831000000...1915666667 with 2048 loops 4 microseconds
|
||||
cpucycles observed persecond 1055750000...1689500000 with 4096 loops 7 microseconds
|
||||
cpucycles observed persecond 1045562500...1305428572 with 8192 loops 15 microseconds
|
||||
cpucycles observed persecond 1102700000...1236357143 with 16384 loops 29 microseconds
|
||||
cpucycles observed persecond 1176053571...1247444445 with 32768 loops 55 microseconds
|
||||
cpucycles observed persecond 1173321428...1209127273 with 65536 loops 111 microseconds
|
||||
cpucycles observed persecond 1187805429...1205210046 with 131072 loops 220 microseconds
|
||||
cpucycles observed persecond 1192415909...1201157535 with 262144 loops 439 microseconds
|
||||
cpucycles observed persecond 1194694760...1199247717 with 524288 loops 877 microseconds
|
||||
cpucycles observed persecond 1194656004...1197023034 with 1048576 loops 1781 microseconds
|
||||
```
|
||||
|
||||
`gcc103`,
|
||||
Apple M1 (Icestorm-M1 + Firestorm-M1),
|
||||
Debian unstable (bookworm),
|
||||
Linux kernel 6.0.0-rc5-asahi-00001-gc62bd3fe430f:
|
||||
```
|
||||
cpucycles version 20230105
|
||||
cpucycles tracesetup 0 arm64-pmc precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 1 arm64-vct precision 186 scaling 86.000000 only32 0
|
||||
cpucycles tracesetup 2 default-perfevent precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 4 default-monotonic precision 285 scaling 2.064000 only32 0
|
||||
cpucycles tracesetup 5 default-gettimeofday precision 2264 scaling 2064.000000 only32 0
|
||||
cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 2064000000
|
||||
cpucycles implementation arm64-vct
|
||||
cpucycles median 0 +0+86+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0
|
||||
cpucycles observed persecond 1784500000...3655000000 with 8192 loops 3 microseconds
|
||||
cpucycles observed persecond 1773750000...2393666667 with 16384 loops 7 microseconds
|
||||
cpucycles observed persecond 1897733333...2222769231 with 32768 loops 14 microseconds
|
||||
cpucycles observed persecond 1951310344...2114962963 with 65536 loops 28 microseconds
|
||||
cpucycles observed persecond 2024071428...2107000000 with 131072 loops 55 microseconds
|
||||
cpucycles observed persecond 2041531531...2082935780 with 262144 loops 110 microseconds
|
||||
cpucycles observed persecond 2051158371...2071461188 with 524288 loops 220 microseconds
|
||||
cpucycles observed persecond 2058539682...2068309795 with 1048576 loops 440 microseconds
|
||||
```
|
||||
|
||||
`gcc112` (`gcc2-power8`),
|
||||
IBM POWER8E,
|
||||
CentOS 7.9 AltArch,
|
||||
Linux kernel 3.10.0-1127.13.1.el7.ppc64le:
|
||||
```
|
||||
cpucycles version 20230105
|
||||
cpucycles tracesetup 0 ppc64-mftb precision 251 scaling 7.207031 only32 0
|
||||
cpucycles tracesetup 1 default-perfevent precision 295 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 3 default-monotonic precision 536 scaling 3.690000 only32 0
|
||||
cpucycles tracesetup 4 default-gettimeofday precision 3890 scaling 3690.000000 only32 0
|
||||
cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 3690000000
|
||||
cpucycles implementation ppc64-mftb
|
||||
cpucycles median 195 +2969-8+14+0-8+7-8-7+7+6-7-1+0-1+0+7+7-15+7-1-7+6+0+0-8+0+6+0-8+7+0+7-8-8-7-1+7-8+7+0-8+0+14-8-7+6+0-8+7+7-15+0-1+0-1+14+0-15+14+0-1+7+0
|
||||
cpucycles observed persecond 2603750000...5510000000 with 2048 loops 3 microseconds
|
||||
cpucycles observed persecond 3430500000...6052250000 with 4096 loops 5 microseconds
|
||||
cpucycles observed persecond 3411333333...4457500000 with 8192 loops 11 microseconds
|
||||
cpucycles observed persecond 3548695652...4060333334 with 16384 loops 22 microseconds
|
||||
cpucycles observed persecond 3624977777...3876534884 with 32768 loops 44 microseconds
|
||||
cpucycles observed persecond 3621855555...3745363637 with 65536 loops 89 microseconds
|
||||
cpucycles observed persecond 3660157303...3722227273 with 131072 loops 177 microseconds
|
||||
cpucycles observed persecond 3680471751...3711622160 with 262144 loops 353 microseconds
|
||||
cpucycles observed persecond 3685321074...3700886525 with 524288 loops 706 microseconds
|
||||
cpucycles observed persecond 3687745930...3695537208 with 1048576 loops 1412 microseconds
|
||||
```
|
||||
|
||||
`gcc202`,
|
||||
UltraSparc T5,
|
||||
Debian unstable (bookworm),
|
||||
Linux kernel 5.19.0-2-sparc64-smp:
|
||||
```
|
||||
cpucycles version 20230105
|
||||
cpucycles tracesetup 0 sparc64-rdtick precision 65 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 1 default-perfevent precision 386 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 3 default-monotonic precision 442 scaling 3.599910 only32 0
|
||||
cpucycles tracesetup 4 default-gettimeofday precision 3799 scaling 3599.910000 only32 0
|
||||
cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 3599910000
|
||||
cpucycles implementation sparc64-rdtick
|
||||
cpucycles median 73 +24+0+24+24+24+24+24+24+0+1+24+0+1+24+0+1+24+0+0+1+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+0+0+0+0+0+0
|
||||
cpucycles observed persecond 2751500000...4258250000 with 4096 loops 5 microseconds
|
||||
cpucycles observed persecond 3289200000...4206875000 with 8192 loops 9 microseconds
|
||||
cpucycles observed persecond 3454789473...3900823530 with 16384 loops 18 microseconds
|
||||
cpucycles observed persecond 3452026315...3659888889 with 32768 loops 37 microseconds
|
||||
cpucycles observed persecond 3543770270...3650916667 with 65536 loops 73 microseconds
|
||||
cpucycles observed persecond 3567299319...3620662069 with 131072 loops 146 microseconds
|
||||
cpucycles observed persecond 3591373287...3618220690 with 262144 loops 291 microseconds
|
||||
cpucycles observed persecond 3597353344...3610774527 with 524288 loops 582 microseconds
|
||||
cpucycles observed persecond 3595899403...3603058071 with 1048576 loops 1172 microseconds
|
||||
```
|
||||
|
||||
IBM z15:
|
||||
```
|
||||
cpucycles version 20230106
|
||||
cpucycles tracesetup 0 s390x-stckf precision 250 scaling 1.269531 only32 0
|
||||
cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 3 default-monotonic precision 272 scaling 5.200000 only32 0
|
||||
cpucycles tracesetup 4 default-gettimeofday precision 5400 scaling 5200.000000 only32 0
|
||||
cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 5200000000
|
||||
cpucycles implementation s390x-stckf
|
||||
cpucycles median 48 +87+8+0-2+0+0+38-2+0+1-3+1+28+0+3-3+1+0+28+0-2+3+0-2+36+0+0+0+1+0+28+0-2+0+3-2+35+1+0-2+0+3+28+0-2+0+0-2+3+25+3+0-2+0+1+35+1+0+0-2+0+28+0
|
||||
cpucycles observed persecond 4948941176...5627733334 with 8192 loops 16 microseconds
|
||||
cpucycles observed persecond 4104125000...5515666667 with 16384 loops 7 microseconds
|
||||
cpucycles observed persecond 5047076923...5987818182 with 32768 loops 12 microseconds
|
||||
cpucycles observed persecond 5044846153...5475708334 with 65536 loops 25 microseconds
|
||||
cpucycles observed persecond 5141313725...5357428572 with 131072 loops 50 microseconds
|
||||
cpucycles observed persecond 5150892156...5257250000 with 262144 loops 101 microseconds
|
||||
cpucycles observed persecond 5183421568...5236549505 with 524288 loops 203 microseconds
|
||||
cpucycles observed persecond 5190282555...5216582717 with 1048576 loops 406 microseconds
|
||||
```
|
||||
@@ -0,0 +1,30 @@
|
||||
To download and unpack the latest version of libcpucycles:
|
||||
|
||||
wget -m https://cpucycles.cr.yp.to/libcpucycles-latest-version.txt
|
||||
version=$(cat cpucycles.cr.yp.to/libcpucycles-latest-version.txt)
|
||||
wget -m https://cpucycles.cr.yp.to/libcpucycles-$version.tar.gz
|
||||
tar -xzf cpucycles.cr.yp.to/libcpucycles-$version.tar.gz
|
||||
cd libcpucycles-$version
|
||||
|
||||
Then [install](install.html).
|
||||
|
||||
### Archives and changelog (reverse chronological)
|
||||
|
||||
[`libcpucycles-20230115.tar.gz`](libcpucycles-20230115.tar.gz) [browse](libcpucycles-20230115.html)
|
||||
|
||||
Update actual `cpucycles_version` behavior to match documentation.
|
||||
|
||||
[`libcpucycles-20230110.tar.gz`](libcpucycles-20230110.tar.gz) [browse](libcpucycles-20230110.html)
|
||||
|
||||
`doc/api.md`: Document `cpucycles_version()`.
|
||||
|
||||
Add `s390x-stckf` counter.
|
||||
|
||||
`cpucycles/default-perfevent.c`: Read into `int64_t` instead of `long long`.
|
||||
Add comment explaining issues with `PERF_FORMAT_TOTAL_TIME_RUNNING`.
|
||||
|
||||
`configure`: Improve `uname` handling.
|
||||
|
||||
`doc/api.md`: Update description of default frequency.
|
||||
|
||||
[`libcpucycles-20230105.tar.gz`](libcpucycles-20230105.tar.gz) [browse](libcpucycles-20230105.html)
|
||||
@@ -0,0 +1,91 @@
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<style type="text/css">
|
||||
html{overflow-y:scroll}
|
||||
body{font-family:sans-serif}
|
||||
p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
|
||||
li p{font-size:1.0em}
|
||||
blockquote p{font-size:1.0em}
|
||||
tt{font-size:1.2em}
|
||||
code{font-size:1.2em}
|
||||
h1{font-size:1.5em}
|
||||
h2{font-size:1.3em}
|
||||
h3{font-size:1.0em}
|
||||
h1 a{text-decoration:none}
|
||||
table{border-collapse:collapse}
|
||||
th,td{border:1px solid black}
|
||||
table a{text-decoration:none}
|
||||
table tr{font-size:0.9em;line-height:1.6em}
|
||||
.links a:hover{text-decoration:underline}
|
||||
.links a:active{text-decoration:underline}
|
||||
.links img{width:200px;padding-left:1em}
|
||||
.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
|
||||
.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
|
||||
.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
|
||||
min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
|
||||
font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
|
||||
.here{border-bottom:0px;background-color:#ffffff}
|
||||
.away{background-color:#125d0d;}
|
||||
.away a{text-decoration:none;display:block;color:#ffffff}
|
||||
.away a:hover,.away a:active{text-decoration:underline}
|
||||
.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
|
||||
</style>
|
||||
<title>
|
||||
API</title>
|
||||
</head>
|
||||
<body>
|
||||
<div class=headline>
|
||||
libcpucycles
|
||||
</div>
|
||||
<div class=nav>
|
||||
<div class="navt away"><a href=index.html>Intro</a>
|
||||
</div><div class="navt away"><a href=download.html>Download</a>
|
||||
</div><div class="navt away"><a href=install.html>Install</a>
|
||||
</div><div class="navt here">API
|
||||
</div><div class="navt away"><a href=counters.html>Counters</a>
|
||||
</div><div class="navt away"><a href=selection.html>Selection</a>
|
||||
</div><div class="navt away"><a href=security.html>Security</a>
|
||||
</div></div>
|
||||
<div class=main>
|
||||
<h3>NAME</h3>
|
||||
<p>cpucycles - count CPU cycles</p>
|
||||
<h3>SYNOPSIS</h3>
|
||||
<pre><code>#include &lt;cpucycles.h&gt;
|
||||
|
||||
long long count = cpucycles();
|
||||
long long persecond = cpucycles_persecond();
|
||||
const char *implementation = cpucycles_implementation();
|
||||
const char *version = cpucycles_version();
|
||||
</code></pre>
|
||||
<p>Link with <code>-lcpucycles</code>. Old systems may also need <code>-lrt</code>.</p>
|
||||
<h3>DESCRIPTION</h3>
|
||||
<p><code>cpucycles()</code> returns an estimate for the number of CPU cycles that have
|
||||
occurred since an unspecified time in the past (perhaps system boot,
|
||||
perhaps program startup).</p>
|
||||
<p>Accessing true cycle counters can be difficult on some CPUs and
|
||||
operating systems. <code>cpucycles()</code> does its best to produce accurate
|
||||
results, but selects a low-precision counter if the only other option is
|
||||
failure.</p>
|
||||
<p><code>cpucycles_persecond()</code> returns an estimate for the number of CPU cycles
|
||||
per second. This estimate comes from <code>/etc/cpucyclespersecond</code> if that
|
||||
file exists, otherwise from various OS mechanisms, otherwise from the
|
||||
<code>cpucyclespersecond</code> environment variable if that is set, otherwise
|
||||
2399987654.</p>
|
||||
<p><code>cpucycles_implementation()</code> returns the name of the counter in use:
|
||||
e.g., <code>"amd64-pmc"</code>.</p>
|
||||
<p><code>cpucycles_version()</code> returns the <code>libcpucycles</code> version number as a
|
||||
string: e.g., <code>"20230115"</code>. Results of <code>cpucycles_implementation()</code>
|
||||
should be interpreted relative to <code>cpucycles_version()</code>.</p>
|
||||
<p><code>cpucycles</code> is actually a function pointer. The first call to
|
||||
<code>cpucycles()</code> or <code>cpucycles_persecond()</code> or <code>cpucycles_implementation()</code>
|
||||
selects one of the available counters and updates the <code>cpucycles</code>
|
||||
pointer accordingly. Subsequent calls to <code>cpucycles()</code> are thread-safe.</p>
|
||||
<h3>SEE ALSO</h3>
|
||||
<p><strong>gettimeofday</strong>(2), <strong>clock_gettime</strong>(2)</p><hr><font size=1><b>Version:</b>
|
||||
This is version 2023.01.15 of the "API" web page.
|
||||
</font>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,456 @@
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<style type="text/css">
|
||||
html{overflow-y:scroll}
|
||||
body{font-family:sans-serif}
|
||||
p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
|
||||
li p{font-size:1.0em}
|
||||
blockquote p{font-size:1.0em}
|
||||
tt{font-size:1.2em}
|
||||
code{font-size:1.2em}
|
||||
h1{font-size:1.5em}
|
||||
h2{font-size:1.3em}
|
||||
h3{font-size:1.0em}
|
||||
h1 a{text-decoration:none}
|
||||
table{border-collapse:collapse}
|
||||
th,td{border:1px solid black}
|
||||
table a{text-decoration:none}
|
||||
table tr{font-size:0.9em;line-height:1.6em}
|
||||
.links a:hover{text-decoration:underline}
|
||||
.links a:active{text-decoration:underline}
|
||||
.links img{width:200px;padding-left:1em}
|
||||
.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
|
||||
.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
|
||||
.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
|
||||
min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
|
||||
font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
|
||||
.here{border-bottom:0px;background-color:#ffffff}
|
||||
.away{background-color:#125d0d;}
|
||||
.away a{text-decoration:none;display:block;color:#ffffff}
|
||||
.away a:hover,.away a:active{text-decoration:underline}
|
||||
.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
|
||||
</style>
|
||||
<title>
|
||||
Counters</title>
|
||||
</head>
|
||||
<body>
|
||||
<div class=headline>
|
||||
libcpucycles
|
||||
</div>
|
||||
<div class=nav>
|
||||
<div class="navt away"><a href=index.html>Intro</a>
|
||||
</div><div class="navt away"><a href=download.html>Download</a>
|
||||
</div><div class="navt away"><a href=install.html>Install</a>
|
||||
</div><div class="navt away"><a href=api.html>API</a>
|
||||
</div><div class="navt here">Counters
|
||||
</div><div class="navt away"><a href=selection.html>Selection</a>
|
||||
</div><div class="navt away"><a href=security.html>Security</a>
|
||||
</div></div>
|
||||
<div class=main>
|
||||
<p>Currently libcpucycles supports the following cycle counters. Some
|
||||
cycle counters are actually other forms of counters that libcpucycles
|
||||
scales to imitate a cycle counter. There is
|
||||
<a href="selection.html">separate documentation</a>
|
||||
for how libcpucycles makes a choice of cycle counter. See also
|
||||
<a href="security.html">security considerations</a> regarding enabling or disabling
|
||||
counters and regarding Turbo Boost.</p>
|
||||
<p><code>amd64-pmc</code>: Requires a 64-bit Intel/AMD platform. Requires the Linux
|
||||
perf_event interface. Accesses a cycle counter through RDPMC. Requires
|
||||
<code>/proc/sys/kernel/perf_event_paranoid</code> to be at most 2 for user-level
|
||||
RDPMC access. This counter runs at the clock frequency of the CPU core.</p>
|
||||
<p><code>amd64-tsc</code>, <code>amd64-tscasm</code>: Requires a 64-bit Intel/AMD platform.
|
||||
Requires RDTSC to be enabled, which it is by default. Uses RDTSC to
|
||||
access the CPU's time-stamp counter. On current CPUs, this is an
|
||||
off-core clock rather than a cycle counter, but it is typically a very
|
||||
fast off-core clock, making it adequate for seeing cycle counts if
|
||||
overclocking and underclocking are disabled. The difference between
|
||||
<code>tsc</code> and <code>tscasm</code> is that <code>tsc</code> uses the compiler's <code>__rdtsc()</code> while
|
||||
<code>tscasm</code> uses inline assembly.</p>
|
||||
<p><code>arm32-cortex</code>: Requires a 32-bit ARMv7-A platform. Uses
|
||||
<code>mrc p15, 0, %0, c9, c13, 0</code> to read the cycle counter. Requires user
|
||||
access to the cycle counter, which is not enabled by default but can be
|
||||
enabled under Linux via
|
||||
<a href="https://github.com/thoughtpolice/enable_arm_pmu">a kernel module</a>.
|
||||
This counter is natively 32 bits, but libcpucycles watches how the
|
||||
counter and <code>gettimeofday</code> increase to compute a 64-bit extension of the
|
||||
counter.</p>
|
||||
<p><code>arm64-pmc</code>: Requires a 64-bit ARMv8-A platform. Uses
|
||||
<code>mrs %0, PMCCNTR_EL0</code> to read the cycle counter. Requires user access
|
||||
to the cycle counter, which is not enabled by default but can be enabled
|
||||
under Linux via
|
||||
<a href="https://github.com/rdolbeau/enable_arm_pmu">a kernel module</a>.</p>
|
||||
<p><code>arm64-vct</code>: Requires a 64-bit ARMv8-A platform. Uses
|
||||
<code>mrs %0, CNTVCT_EL0</code> to read a "virtual count" timer. This is an
|
||||
off-core clock, typically running at 24MHz. Results are scaled by
|
||||
libcpucycles.</p>
|
||||
<p><code>mips64-cc</code>: Requires a 64-bit MIPS platform. (Maybe the same code would
|
||||
also work as <code>mips32-cc</code>, but this has not been tested yet.) Uses RDHWR
|
||||
to read the hardware cycle counter (hardware register 2 times a constant
|
||||
scale factor in hardware register 3). This counter is natively 32 bits,
|
||||
but libcpucycles watches how the counter and <code>gettimeofday</code> increase to
|
||||
compute a 64-bit extension of the counter.</p>
|
||||
<p><code>ppc32-mftb</code>: Requires a 32-bit PowerPC platform. Uses <code>mftb</code> and
|
||||
<code>mftbu</code> to read the "time base". This is an off-core clock, typically
|
||||
running at 24MHz.</p>
|
||||
<p><code>ppc64-mftb</code>: Requires a 64-bit PowerPC platform. Uses <code>mftb</code> and
|
||||
<code>mftbu</code> to read the "time base". This is an off-core clock, typically
|
||||
running at 24MHz. Results are scaled by libcpucycles.</p>
|
||||
<p><code>riscv32-rdcycle</code>: Requires a 32-bit RISC-V platform. Uses <code>rdcycle</code>
|
||||
and <code>rdcycleh</code> to read a cycle counter.</p>
|
||||
<p><code>riscv64-rdcycle</code>: Requires a 64-bit RISC-V platform. Uses <code>rdcycle</code>
|
||||
to read a cycle counter.</p>
|
||||
<p><code>s390x-stckf</code>: Requires a 64-bit z/Architecture platform. Uses <code>stckf</code>
|
||||
to read the TOD clock, which is documented to run at 4096MHz. On the
|
||||
z15, this looks like a doubling of an off-core 2048MHz clock. Results
|
||||
are scaled by libcpucycles.</p>
|
||||
<p><code>sparc64-rdtick</code>: Requires a 64-bit SPARC platform. Uses <code>rd %tick</code>
|
||||
to read a cycle counter.</p>
|
||||
<p><code>x86-tsc</code>, <code>x86-tscasm</code>: Same as <code>amd64-tsc</code> and <code>amd64-tscasm</code>, but
|
||||
for 32-bit Intel/AMD platforms instead of 64-bit Intel/AMD platforms.</p>
|
||||
<p><code>default-gettimeofday</code>: Reasonably portable. Resolution is limited to 1
|
||||
microsecond. Results are scaled by libcpucycles.</p>
|
||||
<p><code>default-mach</code>: Requires an OS with <code>mach_absolute_time()</code>. Typically
|
||||
runs at 24MHz. Results are scaled by libcpucycles.</p>
|
||||
<p><code>default-monotonic</code>: Requires <code>CLOCK_MONOTONIC</code>. Reasonably portable,
|
||||
although might fail on older systems where <code>default-gettimeofday</code> works.
|
||||
Resolution is limited to 1 nanosecond. Can be almost as good as a cycle
|
||||
counter, or orders of magnitude worse, depending on the OS and CPU.
|
||||
Results are scaled by libcpucycles.</p>
|
||||
<p><code>default-perfevent</code>: Requires the Linux <code>perf_event</code> interface, and a
|
||||
CPU where <code>perf_event</code> supports <code>PERF_COUNT_HW_CPU_CYCLES</code>. Similar
|
||||
variations in quality to <code>default-monotonic</code>, without the 1-nanosecond
|
||||
limitation.</p>
|
||||
<p><code>default-zero</code>: The horrifying last resort if nothing else works.</p>
|
||||
<h2>Examples</h2>
|
||||
<p>These are examples of <code>cpucycles-info</code> output on various machines. The
|
||||
machines named <code>gcc*</code> are from the
|
||||
<a href="https://gcc.gnu.org/wiki/CompileFarm">GCC Compile Farm</a>.</p>
|
||||
<p>A <code>median</code> line saying, e.g., <code>47 +47+28+0+2-5+0+2-5...</code> means that the
|
||||
differences between adjacent cycle counts were 47+47, 47+28, 47+0, 47+2,
|
||||
47−5, 47+0, 47+2, 47−5, etc., with median difference 47. The first few
|
||||
differences are typically larger because of cache effects.</p>
|
||||
<p><code>pi3aplus</code>,
|
||||
Broadcom BCM2837B0:</p>
|
||||
<pre><code>cpucycles version 20230105
|
||||
cpucycles tracesetup 0 arm64-pmc precision 9 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 1 arm64-vct precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 2 default-perfevent precision 189 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 4 default-monotonic precision 272 scaling 1.400000 only32 0
|
||||
cpucycles tracesetup 5 default-gettimeofday precision 1600 scaling 1400.000000 only32 0
|
||||
cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 1400000000
|
||||
cpucycles implementation arm64-pmc
|
||||
cpucycles median 10 +10+8+3+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
|
||||
cpucycles observed persecond 1032000000...4224666667 with 1024 loops 4 microseconds
|
||||
cpucycles observed persecond 1286000000...1756000000 with 2048 loops 7 microseconds
|
||||
cpucycles observed persecond 1368266666...1598000000 with 4096 loops 14 microseconds
|
||||
cpucycles observed persecond 1366700000...1473428572 with 8192 loops 29 microseconds
|
||||
cpucycles observed persecond 1366100000...1417534483 with 16384 loops 59 microseconds
|
||||
cpucycles observed persecond 1332739837...1357132232 with 32768 loops 122 microseconds
|
||||
cpucycles observed persecond 1354483471...1366945834 with 65536 loops 241 microseconds
|
||||
cpucycles observed persecond 1385684989...1392195330 with 131072 loops 472 microseconds
|
||||
cpucycles observed persecond 1347223021...1350328528 with 262144 loops 972 microseconds
|
||||
cpucycles observed persecond 1375460125...1377069853 with 524288 loops 1905 microseconds
|
||||
cpucycles observed persecond 1376527697...1377335961 with 1048576 loops 3808 microseconds
|
||||
</code></pre>
|
||||
<p><code>bblack</code>,
|
||||
TI Sitara XAM3359AZCZ100:</p>
|
||||
<pre><code>cpucycles version 20230105
|
||||
cpucycles tracesetup 0 arm32-cortex precision 8 scaling 1.000000 only32 1
|
||||
cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 3 default-monotonic precision 1283 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 4 default-gettimeofday precision 1200 scaling 1000.000000 only32 0
|
||||
cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 1000000000
|
||||
cpucycles implementation arm32-cortex
|
||||
cpucycles median 1260 +1506+62+31+7+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+13+7+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
|
||||
cpucycles observed persecond 622181818...2101888889 with 1024 loops 10 microseconds
|
||||
cpucycles observed persecond 806133333...1492615385 with 2048 loops 14 microseconds
|
||||
cpucycles observed persecond 879880000...1232565218 with 4096 loops 24 microseconds
|
||||
cpucycles observed persecond 939577777...1130581396 with 8192 loops 44 microseconds
|
||||
cpucycles observed persecond 956954022...1050047059 with 16384 loops 86 microseconds
|
||||
cpucycles observed persecond 982878542...1020685715 with 32768 loops 246 microseconds
|
||||
cpucycles observed persecond 988105105...1012217523 with 65536 loops 332 microseconds
|
||||
cpucycles observed persecond 993752077...1007159723 with 131072 loops 721 microseconds
|
||||
cpucycles observed persecond 995364296...1004009448 with 262144 loops 1377 microseconds
|
||||
cpucycles observed persecond 998216306...1001821536 with 524288 loops 2685 microseconds
|
||||
cpucycles observed persecond 998991848...1000914196 with 1048576 loops 5397 microseconds
|
||||
</code></pre>
|
||||
<p><code>hiphop</code>,
|
||||
Intel Xeon E3-1220 v3:</p>
|
||||
<pre><code>cpucycles version 20230105
|
||||
cpucycles tracesetup 0 amd64-pmc precision 40 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 1 amd64-tsc precision 124 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 amd64-tscasm precision 124 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 3 default-perfevent precision 160 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 5 default-monotonic precision 272 scaling 3.100000 only32 0
|
||||
cpucycles tracesetup 6 default-gettimeofday precision 3300 scaling 3100.000000 only32 0
|
||||
cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 3100000000
|
||||
cpucycles implementation amd64-pmc
|
||||
cpucycles median 44 +38+23+23+23-4+0-4+0-4+0-4+0+10-4-2+1-4+1-4+1+17+1-4+1-4+1-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4+0-4
|
||||
cpucycles observed persecond 2066500000...4235000000 with 8192 loops 3 microseconds
|
||||
cpucycles observed persecond 2760833333...4200250000 with 16384 loops 5 microseconds
|
||||
cpucycles observed persecond 2743416666...3313100000 with 32768 loops 11 microseconds
|
||||
cpucycles observed persecond 2986227272...3295000000 with 65536 loops 21 microseconds
|
||||
cpucycles observed persecond 3052069767...3206073171 with 131072 loops 42 microseconds
|
||||
cpucycles observed persecond 3050395348...3125523810 with 262144 loops 85 microseconds
|
||||
cpucycles observed persecond 3085123529...3123059524 with 524288 loops 169 microseconds
|
||||
cpucycles observed persecond 3084561764...3103434912 with 1048576 loops 339 microseconds
|
||||
</code></pre>
|
||||
<p><code>nucnuc</code>,
|
||||
Intel Pentium N3700:</p>
|
||||
<pre><code>cpucycles version 20230105
|
||||
cpucycles tracesetup 0 amd64-pmc precision 26 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 1 amd64-tsc precision 120 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 amd64-tscasm precision 120 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 3 default-perfevent precision 427 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 5 default-monotonic precision 320 scaling 1.600000 only32 0
|
||||
cpucycles tracesetup 6 default-gettimeofday precision 1800 scaling 1600.000000 only32 0
|
||||
cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 1600000000
|
||||
cpucycles implementation amd64-pmc
|
||||
cpucycles median 66 +12+12+14+14-1-1+0-1+0-1+0-1+0+1-1+0-1+0-1+0-2+0-1+0-1+0-1+0-2+0-1+0-1+0-1+0-2+0-1+0-1+1-1+0-2-1-1+0-1+0-1+0-2+0-1+2+0-1+0-1+0+0-1
|
||||
cpucycles observed persecond 1060500000...2325000000 with 2048 loops 3 microseconds
|
||||
cpucycles observed persecond 1387166666...2208250000 with 4096 loops 5 microseconds
|
||||
cpucycles observed persecond 1376083333...1705500000 with 8192 loops 11 microseconds
|
||||
cpucycles observed persecond 1495727272...1671800000 with 16384 loops 21 microseconds
|
||||
cpucycles observed persecond 1563428571...1655100000 with 32768 loops 41 microseconds
|
||||
cpucycles observed persecond 1580807228...1626234568 with 65536 loops 82 microseconds
|
||||
cpucycles observed persecond 1589539393...1612619632 with 131072 loops 164 microseconds
|
||||
cpucycles observed persecond 1598841463...1610230062 with 262144 loops 327 microseconds
|
||||
cpucycles observed persecond 1564336810...1569988042 with 524288 loops 670 microseconds
|
||||
cpucycles observed persecond 1599759725...1602608098 with 1048576 loops 1310 microseconds
|
||||
</code></pre>
|
||||
<p><code>saber214</code>,
|
||||
AMD FX-8350:</p>
|
||||
<pre><code>cpucycles version 20230105
|
||||
cpucycles tracesetup 0 amd64-pmc precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 1 amd64-tsc precision 167 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 amd64-tscasm precision 168 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 3 default-perfevent precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 5 default-monotonic precision 376 scaling 4.013452 only32 0
|
||||
cpucycles tracesetup 6 default-gettimeofday precision 4213 scaling 4013.452000 only32 0
|
||||
cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 4013452000
|
||||
cpucycles implementation amd64-tsc
|
||||
cpucycles median 77 +87-2+21+7+4+1+0+2-2-7-4+0+1+4-2+3+1-2-2+5-6+2+2+2+2+1-1-1+0-4+0-1-1-1-2+3-1-1+2-2+0+0+2+0+0+2-2-2+1-1-2+2-5+2+0+2+0+1+0+3-2-1-1
|
||||
cpucycles observed persecond 2767500000...5759000000 with 4096 loops 3 microseconds
|
||||
cpucycles observed persecond 3426000000...4893800000 with 8192 loops 6 microseconds
|
||||
cpucycles observed persecond 3724076923...4446363637 with 16384 loops 12 microseconds
|
||||
cpucycles observed persecond 3977833333...4363318182 with 32768 loops 23 microseconds
|
||||
cpucycles observed persecond 3984854166...4168739131 with 65536 loops 47 microseconds
|
||||
cpucycles observed persecond 3981709923...4048193799 with 131072 loops 130 microseconds
|
||||
cpucycles observed persecond 3982716417...4026914573 with 262144 loops 200 microseconds
|
||||
cpucycles observed persecond 4001637602...4025136987 with 524288 loops 366 microseconds
|
||||
cpucycles observed persecond 4007411111...4018600248 with 1048576 loops 809 microseconds
|
||||
</code></pre>
|
||||
<p><code>gcc14</code>,
|
||||
Intel Xeon E5-2620 v3,
|
||||
Debian testing (bookworm),
|
||||
Linux kernel 6.0.0-6-amd64:</p>
|
||||
<pre><code>cpucycles version 20230105
|
||||
cpucycles tracesetup 0 amd64-pmc precision 41 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 1 amd64-tsc precision 148 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 amd64-tscasm precision 148 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 3 default-perfevent precision 159 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 4 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 5 default-monotonic precision 289 scaling 3.200000 only32 0
|
||||
cpucycles tracesetup 6 default-gettimeofday precision 3400 scaling 3200.000000 only32 0
|
||||
cpucycles tracesetup 7 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 3200000000
|
||||
cpucycles implementation amd64-pmc
|
||||
cpucycles median 47 +47+28+0+2-5+0+2-5+16+2-5+0+2-5+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0+1-4+0
|
||||
cpucycles observed persecond 1653800000...2819333334 with 8192 loops 4 microseconds
|
||||
cpucycles observed persecond 1832111111...2389285715 with 16384 loops 8 microseconds
|
||||
cpucycles observed persecond 1936058823...2207200000 with 32768 loops 16 microseconds
|
||||
cpucycles observed persecond 2052843750...2196200000 with 65536 loops 31 microseconds
|
||||
cpucycles observed persecond 2050750000...2120048388 with 131072 loops 63 microseconds
|
||||
cpucycles observed persecond 2081896825...2117048388 with 262144 loops 125 microseconds
|
||||
cpucycles observed persecond 2089478087...2107044177 with 524288 loops 250 microseconds
|
||||
cpucycles observed persecond 2093343313...2102124249 with 1048576 loops 500 microseconds
|
||||
</code></pre>
|
||||
<p><code>gcc23</code>,
|
||||
Cavium Octeon II V0.1,
|
||||
Debian 8.11,
|
||||
Linux kernel 4.1.4:</p>
|
||||
<pre><code>cpucycles version 20230105
|
||||
cpucycles tracesetup 0 mips64-cc precision 24 scaling 1.000000 only32 1
|
||||
cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 3 default-monotonic precision 46702 scaling 2.399988 only32 0
|
||||
cpucycles tracesetup 4 default-gettimeofday precision 45799 scaling 2399.987654 only32 0
|
||||
cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 2399987654
|
||||
cpucycles implementation mips64-cc
|
||||
cpucycles median 2177 +828+17+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0
|
||||
cpucycles observed persecond 641900000...1845125000 with 1024 loops 9 microseconds
|
||||
cpucycles observed persecond 745357142...1352083334 with 2048 loops 13 microseconds
|
||||
cpucycles observed persecond 809826086...1162333334 with 4096 loops 22 microseconds
|
||||
cpucycles observed persecond 897717948...1104405406 with 8192 loops 38 microseconds
|
||||
cpucycles observed persecond 957467532...1059986667 with 16384 loops 76 microseconds
|
||||
cpucycles observed persecond 973102189...1029777778 with 32768 loops 136 microseconds
|
||||
cpucycles observed persecond 986518656...1015830828 with 65536 loops 267 microseconds
|
||||
cpucycles observed persecond 993452830...1008166667 with 131072 loops 529 microseconds
|
||||
cpucycles observed persecond 996036966...1003403609 with 262144 loops 1054 microseconds
|
||||
cpucycles observed persecond 984706378...1001682630 with 524288 loops 2131 microseconds
|
||||
cpucycles observed persecond 992585292...1001178580 with 1048576 loops 4296 microseconds
|
||||
</code></pre>
|
||||
<p><code>gcc45</code>,
|
||||
AMD Athlon II X4 640,
|
||||
Debian 8.11,
|
||||
Linux kernel 3.16.0-11-686-pae:</p>
|
||||
<pre><code>cpucycles version 20230105
|
||||
cpucycles tracesetup 0 x86-tsc precision 199 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 1 x86-tscasm precision 199 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 default-perfevent precision 170 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 4 default-monotonic precision 941 scaling 3.000000 only32 0
|
||||
cpucycles tracesetup 5 default-gettimeofday precision 3200 scaling 3000.000000 only32 0
|
||||
cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 3000000000
|
||||
cpucycles implementation default-perfevent
|
||||
cpucycles median 72 +12+0+0+0+0+0+0+0+5+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+2+0+0+0+0+0+0+0+1+0+0+0+0+0+0
|
||||
cpucycles observed persecond 541500000...1812000000 with 1024 loops 3 microseconds
|
||||
cpucycles observed persecond 712333333...1212250000 with 2048 loops 5 microseconds
|
||||
cpucycles observed persecond 1193285714...1733600000 with 4096 loops 6 microseconds
|
||||
cpucycles observed persecond 1689176470...1804562500 with 8192 loops 33 microseconds
|
||||
cpucycles observed persecond 1713074626...1770600000 with 16384 loops 66 microseconds
|
||||
cpucycles observed persecond 1765107692...1795140625 with 32768 loops 129 microseconds
|
||||
cpucycles observed persecond 1785369649...1800603922 with 65536 loops 256 microseconds
|
||||
cpucycles observed persecond 1781377862...1796288462 with 131072 loops 261 microseconds
|
||||
cpucycles observed persecond 1772647398...1778247827 with 262144 loops 691 microseconds
|
||||
cpucycles observed persecond 1789670493...1794149598 with 524288 loops 870 microseconds
|
||||
cpucycles observed persecond 1860276211...1861561332 with 1048576 loops 3156 microseconds
|
||||
</code></pre>
|
||||
<p><code>gcc92</code>,
|
||||
SiFive Freedom U740,
|
||||
Ubuntu 22.04,
|
||||
Linux kernel 5.15.0-1014-generic:</p>
|
||||
<pre><code>cpucycles version 20230105
|
||||
cpucycles tracesetup 0 riscv64-rdcycle precision 8 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 1 default-perfevent precision 3024 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 3 default-monotonic precision 2599 scaling 2.399988 only32 0
|
||||
cpucycles tracesetup 4 default-gettimeofday precision 2599 scaling 2399.987654 only32 0
|
||||
cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 2399987654
|
||||
cpucycles implementation riscv64-rdcycle
|
||||
cpucycles median 8 +33+27+1+1+1+1+0+0+0+22+0+0+0+0+0+0+0+628+0+0+0+7+0+0+0+145+0+0+0+0+0+0+0+22+0+0+0+0+0+0+0+158+0+0+0+0+0+0+0+22+0+0+0+0+0+0+0+22+0+0+0+0+0
|
||||
cpucycles observed persecond 530250000...1978000000 with 1024 loops 3 microseconds
|
||||
cpucycles observed persecond 831000000...1915666667 with 2048 loops 4 microseconds
|
||||
cpucycles observed persecond 1055750000...1689500000 with 4096 loops 7 microseconds
|
||||
cpucycles observed persecond 1045562500...1305428572 with 8192 loops 15 microseconds
|
||||
cpucycles observed persecond 1102700000...1236357143 with 16384 loops 29 microseconds
|
||||
cpucycles observed persecond 1176053571...1247444445 with 32768 loops 55 microseconds
|
||||
cpucycles observed persecond 1173321428...1209127273 with 65536 loops 111 microseconds
|
||||
cpucycles observed persecond 1187805429...1205210046 with 131072 loops 220 microseconds
|
||||
cpucycles observed persecond 1192415909...1201157535 with 262144 loops 439 microseconds
|
||||
cpucycles observed persecond 1194694760...1199247717 with 524288 loops 877 microseconds
|
||||
cpucycles observed persecond 1194656004...1197023034 with 1048576 loops 1781 microseconds
|
||||
</code></pre>
|
||||
<p><code>gcc103</code>,
|
||||
Apple M1 (Icestorm-M1 + Firestorm-M1),
|
||||
Debian unstable (bookworm),
|
||||
Linux kernel 6.0.0-rc5-asahi-00001-gc62bd3fe430f:</p>
|
||||
<pre><code>cpucycles version 20230105
|
||||
cpucycles tracesetup 0 arm64-pmc precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 1 arm64-vct precision 186 scaling 86.000000 only32 0
|
||||
cpucycles tracesetup 2 default-perfevent precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 3 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 4 default-monotonic precision 285 scaling 2.064000 only32 0
|
||||
cpucycles tracesetup 5 default-gettimeofday precision 2264 scaling 2064.000000 only32 0
|
||||
cpucycles tracesetup 6 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 2064000000
|
||||
cpucycles implementation arm64-vct
|
||||
cpucycles median 0 +0+86+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+86+0+0+0+0+0+0+0+0
|
||||
cpucycles observed persecond 1784500000...3655000000 with 8192 loops 3 microseconds
|
||||
cpucycles observed persecond 1773750000...2393666667 with 16384 loops 7 microseconds
|
||||
cpucycles observed persecond 1897733333...2222769231 with 32768 loops 14 microseconds
|
||||
cpucycles observed persecond 1951310344...2114962963 with 65536 loops 28 microseconds
|
||||
cpucycles observed persecond 2024071428...2107000000 with 131072 loops 55 microseconds
|
||||
cpucycles observed persecond 2041531531...2082935780 with 262144 loops 110 microseconds
|
||||
cpucycles observed persecond 2051158371...2071461188 with 524288 loops 220 microseconds
|
||||
cpucycles observed persecond 2058539682...2068309795 with 1048576 loops 440 microseconds
|
||||
</code></pre>
|
||||
<p><code>gcc112</code> (<code>gcc2-power8</code>),
|
||||
IBM POWER8E,
|
||||
CentOS 7.9 AltArch,
|
||||
Linux kernel 3.10.0-1127.13.1.el7.ppc64le:</p>
|
||||
<pre><code>cpucycles version 20230105
|
||||
cpucycles tracesetup 0 ppc64-mftb precision 251 scaling 7.207031 only32 0
|
||||
cpucycles tracesetup 1 default-perfevent precision 295 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 3 default-monotonic precision 536 scaling 3.690000 only32 0
|
||||
cpucycles tracesetup 4 default-gettimeofday precision 3890 scaling 3690.000000 only32 0
|
||||
cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 3690000000
|
||||
cpucycles implementation ppc64-mftb
|
||||
cpucycles median 195 +2969-8+14+0-8+7-8-7+7+6-7-1+0-1+0+7+7-15+7-1-7+6+0+0-8+0+6+0-8+7+0+7-8-8-7-1+7-8+7+0-8+0+14-8-7+6+0-8+7+7-15+0-1+0-1+14+0-15+14+0-1+7+0
|
||||
cpucycles observed persecond 2603750000...5510000000 with 2048 loops 3 microseconds
|
||||
cpucycles observed persecond 3430500000...6052250000 with 4096 loops 5 microseconds
|
||||
cpucycles observed persecond 3411333333...4457500000 with 8192 loops 11 microseconds
|
||||
cpucycles observed persecond 3548695652...4060333334 with 16384 loops 22 microseconds
|
||||
cpucycles observed persecond 3624977777...3876534884 with 32768 loops 44 microseconds
|
||||
cpucycles observed persecond 3621855555...3745363637 with 65536 loops 89 microseconds
|
||||
cpucycles observed persecond 3660157303...3722227273 with 131072 loops 177 microseconds
|
||||
cpucycles observed persecond 3680471751...3711622160 with 262144 loops 353 microseconds
|
||||
cpucycles observed persecond 3685321074...3700886525 with 524288 loops 706 microseconds
|
||||
cpucycles observed persecond 3687745930...3695537208 with 1048576 loops 1412 microseconds
|
||||
</code></pre>
|
||||
<p><code>gcc202</code>,
|
||||
UltraSparc T5,
|
||||
Debian unstable (bookworm),
|
||||
Linux kernel 5.19.0-2-sparc64-smp:</p>
|
||||
<pre><code>cpucycles version 20230105
|
||||
cpucycles tracesetup 0 sparc64-rdtick precision 65 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 1 default-perfevent precision 386 scaling 1.000000 only32 0
|
||||
cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 3 default-monotonic precision 442 scaling 3.599910 only32 0
|
||||
cpucycles tracesetup 4 default-gettimeofday precision 3799 scaling 3599.910000 only32 0
|
||||
cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 3599910000
|
||||
cpucycles implementation sparc64-rdtick
|
||||
cpucycles median 73 +24+0+24+24+24+24+24+24+0+1+24+0+1+24+0+1+24+0+0+1+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+0+1+0+0+0+0+0+0+0+0+0+0+0+0+0
|
||||
cpucycles observed persecond 2751500000...4258250000 with 4096 loops 5 microseconds
|
||||
cpucycles observed persecond 3289200000...4206875000 with 8192 loops 9 microseconds
|
||||
cpucycles observed persecond 3454789473...3900823530 with 16384 loops 18 microseconds
|
||||
cpucycles observed persecond 3452026315...3659888889 with 32768 loops 37 microseconds
|
||||
cpucycles observed persecond 3543770270...3650916667 with 65536 loops 73 microseconds
|
||||
cpucycles observed persecond 3567299319...3620662069 with 131072 loops 146 microseconds
|
||||
cpucycles observed persecond 3591373287...3618220690 with 262144 loops 291 microseconds
|
||||
cpucycles observed persecond 3597353344...3610774527 with 524288 loops 582 microseconds
|
||||
cpucycles observed persecond 3595899403...3603058071 with 1048576 loops 1172 microseconds
|
||||
</code></pre>
|
||||
<p>IBM z15:</p>
|
||||
<pre><code>cpucycles version 20230106
|
||||
cpucycles tracesetup 0 s390x-stckf precision 250 scaling 1.269531 only32 0
|
||||
cpucycles tracesetup 1 default-perfevent precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 2 default-mach precision 0 scaling 0.000000 only32 0
|
||||
cpucycles tracesetup 3 default-monotonic precision 272 scaling 5.200000 only32 0
|
||||
cpucycles tracesetup 4 default-gettimeofday precision 5400 scaling 5200.000000 only32 0
|
||||
cpucycles tracesetup 5 default-zero precision 0 scaling 0.000000 only32 0
|
||||
cpucycles persecond 5200000000
|
||||
cpucycles implementation s390x-stckf
|
||||
cpucycles median 48 +87+8+0-2+0+0+38-2+0+1-3+1+28+0+3-3+1+0+28+0-2+3+0-2+36+0+0+0+1+0+28+0-2+0+3-2+35+1+0-2+0+3+28+0-2+0+0-2+3+25+3+0-2+0+1+35+1+0+0-2+0+28+0
|
||||
cpucycles observed persecond 4948941176...5627733334 with 8192 loops 16 microseconds
|
||||
cpucycles observed persecond 4104125000...5515666667 with 16384 loops 7 microseconds
|
||||
cpucycles observed persecond 5047076923...5987818182 with 32768 loops 12 microseconds
|
||||
cpucycles observed persecond 5044846153...5475708334 with 65536 loops 25 microseconds
|
||||
cpucycles observed persecond 5141313725...5357428572 with 131072 loops 50 microseconds
|
||||
cpucycles observed persecond 5150892156...5257250000 with 262144 loops 101 microseconds
|
||||
cpucycles observed persecond 5183421568...5236549505 with 524288 loops 203 microseconds
|
||||
cpucycles observed persecond 5190282555...5216582717 with 1048576 loops 406 microseconds
|
||||
</code></pre><hr><font size=1><b>Version:</b>
|
||||
This is version 2023.01.06 of the "Counters" web page.
|
||||
</font>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,75 @@
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<style type="text/css">
|
||||
html{overflow-y:scroll}
|
||||
body{font-family:sans-serif}
|
||||
p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
|
||||
li p{font-size:1.0em}
|
||||
blockquote p{font-size:1.0em}
|
||||
tt{font-size:1.2em}
|
||||
code{font-size:1.2em}
|
||||
h1{font-size:1.5em}
|
||||
h2{font-size:1.3em}
|
||||
h3{font-size:1.0em}
|
||||
h1 a{text-decoration:none}
|
||||
table{border-collapse:collapse}
|
||||
th,td{border:1px solid black}
|
||||
table a{text-decoration:none}
|
||||
table tr{font-size:0.9em;line-height:1.6em}
|
||||
.links a:hover{text-decoration:underline}
|
||||
.links a:active{text-decoration:underline}
|
||||
.links img{width:200px;padding-left:1em}
|
||||
.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
|
||||
.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
|
||||
.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
|
||||
min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
|
||||
font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
|
||||
.here{border-bottom:0px;background-color:#ffffff}
|
||||
.away{background-color:#125d0d;}
|
||||
.away a{text-decoration:none;display:block;color:#ffffff}
|
||||
.away a:hover,.away a:active{text-decoration:underline}
|
||||
.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
|
||||
</style>
|
||||
<title>
|
||||
Download</title>
|
||||
</head>
|
||||
<body>
|
||||
<div class=headline>
|
||||
libcpucycles
|
||||
</div>
|
||||
<div class=nav>
|
||||
<div class="navt away"><a href=index.html>Intro</a>
|
||||
</div><div class="navt here">Download
|
||||
</div><div class="navt away"><a href=install.html>Install</a>
|
||||
</div><div class="navt away"><a href=api.html>API</a>
|
||||
</div><div class="navt away"><a href=counters.html>Counters</a>
|
||||
</div><div class="navt away"><a href=selection.html>Selection</a>
|
||||
</div><div class="navt away"><a href=security.html>Security</a>
|
||||
</div></div>
|
||||
<div class=main>
|
||||
<p>To download and unpack the latest version of libcpucycles:</p>
|
||||
<pre><code> wget -m https://cpucycles.cr.yp.to/libcpucycles-latest-version.txt
|
||||
version=$(cat cpucycles.cr.yp.to/libcpucycles-latest-version.txt)
|
||||
wget -m https://cpucycles.cr.yp.to/libcpucycles-$version.tar.gz
|
||||
tar -xzf cpucycles.cr.yp.to/libcpucycles-$version.tar.gz
|
||||
cd libcpucycles-$version
|
||||
</code></pre>
|
||||
<p>Then <a href="install.html">install</a>.</p>
|
||||
<h3>Archives and changelog (reverse chronological)</h3>
|
||||
<p><a href="libcpucycles-20230115.tar.gz"><code>libcpucycles-20230115.tar.gz</code></a> <a href="libcpucycles-20230115.html">browse</a></p>
|
||||
<p>Update actual <code>cpucycles_version</code> behavior to match documentation.</p>
|
||||
<p><a href="libcpucycles-20230110.tar.gz"><code>libcpucycles-20230110.tar.gz</code></a> <a href="libcpucycles-20230110.html">browse</a></p>
|
||||
<p><code>doc/api.md</code>: Document <code>cpucycles_version()</code>.</p>
|
||||
<p>Add <code>s390x-stckf</code> counter.</p>
|
||||
<p><code>cpucycles/default-perfevent.c</code>: Read into <code>int64_t</code> instead of <code>long long</code>.
|
||||
Add comment explaining issues with <code>PERF_FORMAT_TOTAL_TIME_RUNNING</code>.</p>
|
||||
<p><code>configure</code>: Improve <code>uname</code> handling.</p>
|
||||
<p><code>doc/api.md</code>: Update description of default frequency.</p>
|
||||
<p><a href="libcpucycles-20230105.tar.gz"><code>libcpucycles-20230105.tar.gz</code></a> <a href="libcpucycles-20230105.html">browse</a></p><hr><font size=1><b>Version:</b>
|
||||
This is version 2023.01.15 of the "Download" web page.
|
||||
</font>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,88 @@
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<style type="text/css">
|
||||
html{overflow-y:scroll}
|
||||
body{font-family:sans-serif}
|
||||
p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
|
||||
li p{font-size:1.0em}
|
||||
blockquote p{font-size:1.0em}
|
||||
tt{font-size:1.2em}
|
||||
code{font-size:1.2em}
|
||||
h1{font-size:1.5em}
|
||||
h2{font-size:1.3em}
|
||||
h3{font-size:1.0em}
|
||||
h1 a{text-decoration:none}
|
||||
table{border-collapse:collapse}
|
||||
th,td{border:1px solid black}
|
||||
table a{text-decoration:none}
|
||||
table tr{font-size:0.9em;line-height:1.6em}
|
||||
.links a:hover{text-decoration:underline}
|
||||
.links a:active{text-decoration:underline}
|
||||
.links img{width:200px;padding-left:1em}
|
||||
.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
|
||||
.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
|
||||
.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
|
||||
min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
|
||||
font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
|
||||
.here{border-bottom:0px;background-color:#ffffff}
|
||||
.away{background-color:#125d0d;}
|
||||
.away a{text-decoration:none;display:block;color:#ffffff}
|
||||
.away a:hover,.away a:active{text-decoration:underline}
|
||||
.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
|
||||
</style>
|
||||
<title>
|
||||
Intro</title>
|
||||
</head>
|
||||
<body>
|
||||
<div class=headline>
|
||||
libcpucycles
|
||||
</div>
|
||||
<div class=nav>
|
||||
<div class="navt here">Intro
|
||||
</div><div class="navt away"><a href=download.html>Download</a>
|
||||
</div><div class="navt away"><a href=install.html>Install</a>
|
||||
</div><div class="navt away"><a href=api.html>API</a>
|
||||
</div><div class="navt away"><a href=counters.html>Counters</a>
|
||||
</div><div class="navt away"><a href=selection.html>Selection</a>
|
||||
</div><div class="navt away"><a href=security.html>Security</a>
|
||||
</div></div>
|
||||
<div class=main>
|
||||
<p>libcpucycles is a public-domain microlibrary for counting CPU cycles.
|
||||
Cycle counts are not as detailed as
|
||||
<a href="https://gamozolabs.github.io/metrology/2019/08/19/sushi_roll.html">Falk diagrams</a>
|
||||
but are the most precise timers available to typical software; they are
|
||||
central tools used in understanding and improving software performance.</p>
|
||||
<p>The libcpucycles <a href="api.html">API</a> is simple: include <code>&lt;cpucycles.h&gt;</code>, call
|
||||
<code>cpucycles()</code> to receive a <code>long long</code> whenever desired, and link with
|
||||
<code>-lcpucycles</code>.</p>
|
||||
<p><a href="counters.html">Internally</a>, libcpucycles understands machine-level
|
||||
cycle counters for amd64 (both PMC and TSC), arm32, arm64 (both PMC and
|
||||
VCT), mips64, ppc32, ppc64, riscv32, riscv64, s390x, sparc64, and x86.
|
||||
libcpucycles also understands four OS-level mechanisms, which give
|
||||
varying levels of accuracy: <code>mach_absolute_time</code>, <code>perf_event</code>,
|
||||
<code>CLOCK_MONOTONIC</code>, and, as a fallback, microsecond-resolution
|
||||
<code>gettimeofday</code>.</p>
|
||||
<p>When the program first calls <code>cpucycles()</code>, libcpucycles automatically
|
||||
benchmarks the available mechanisms and <a href="selection.html">selects</a> the
|
||||
mechanism that does the best job. Subsequent <code>cpucycles()</code> calls are
|
||||
thread-safe and very fast. An accompanying <code>cpucycles-info</code> program
|
||||
prints a summary of cycle-counter accuracy.</p>
|
||||
<p>For comparison, there is a simple-sounding <code>__rdtsc()</code> API provided by
|
||||
compilers, but this works only on Intel/AMD CPUs and is generally noisier
|
||||
than PMC. There is a <code>__builtin_readcyclecounter()</code> that works on more
|
||||
CPUs, but this works only with <code>clang</code> and has the same noise problems.
|
||||
Both of these mechanisms put the burden on the caller to figure out what
|
||||
can be done on other CPUs. Various packages include their own more
|
||||
portable abstraction layers for counting cycles (see, e.g., FFTW's
|
||||
<a href="https://github.com/FFTW/fftw3/blob/master/kernel/cycle.h"><code>cycle.h</code></a>,
|
||||
used to automatically select from among multiple implementations
|
||||
provided by FFTW), but this creates per-package effort to keep up with
|
||||
the latest cycle counters. The goal of libcpucycles is to provide
|
||||
state-of-the-art cycle counting centrally for all packages to use.</p><hr><font size=1><b>Version:</b>
|
||||
This is version 2023.01.06 of the "Intro" web page.
|
||||
</font>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,101 @@
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<style type="text/css">
|
||||
html{overflow-y:scroll}
|
||||
body{font-family:sans-serif}
|
||||
p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
|
||||
li p{font-size:1.0em}
|
||||
blockquote p{font-size:1.0em}
|
||||
tt{font-size:1.2em}
|
||||
code{font-size:1.2em}
|
||||
h1{font-size:1.5em}
|
||||
h2{font-size:1.3em}
|
||||
h3{font-size:1.0em}
|
||||
h1 a{text-decoration:none}
|
||||
table{border-collapse:collapse}
|
||||
th,td{border:1px solid black}
|
||||
table a{text-decoration:none}
|
||||
table tr{font-size:0.9em;line-height:1.6em}
|
||||
.links a:hover{text-decoration:underline}
|
||||
.links a:active{text-decoration:underline}
|
||||
.links img{width:200px;padding-left:1em}
|
||||
.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
|
||||
.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
|
||||
.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
|
||||
min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
|
||||
font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
|
||||
.here{border-bottom:0px;background-color:#ffffff}
|
||||
.away{background-color:#125d0d;}
|
||||
.away a{text-decoration:none;display:block;color:#ffffff}
|
||||
.away a:hover,.away a:active{text-decoration:underline}
|
||||
.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
|
||||
</style>
|
||||
<title>
|
||||
Install</title>
|
||||
</head>
|
||||
<body>
|
||||
<div class=headline>
|
||||
libcpucycles
|
||||
</div>
|
||||
<div class=nav>
|
||||
<div class="navt away"><a href=index.html>Intro</a>
|
||||
</div><div class="navt away"><a href=download.html>Download</a>
|
||||
</div><div class="navt here">Install
|
||||
</div><div class="navt away"><a href=api.html>API</a>
|
||||
</div><div class="navt away"><a href=counters.html>Counters</a>
|
||||
</div><div class="navt away"><a href=selection.html>Selection</a>
|
||||
</div><div class="navt away"><a href=security.html>Security</a>
|
||||
</div></div>
|
||||
<div class=main>
|
||||
<p>Prerequisites: <code>python3</code>; <code>gcc</code> and/or <code>clang</code>. Currently tested only
|
||||
under Linux, but porting to other systems shouldn't be difficult.</p>
|
||||
<p>For sysadmins, to install in <code>/usr/local/{include,lib,bin}</code>:</p>
|
||||
<pre><code> ./configure && make -j8 install
|
||||
</code></pre>
|
||||
<p>For developers with an unprivileged account (typically with</p>
|
||||
<pre><code> export LD_LIBRARY_PATH="$HOME/lib"
|
||||
export LIBRARY_PATH="$HOME/lib"
|
||||
export CPATH="$HOME/include"
|
||||
export PATH="$HOME/bin:$PATH"
|
||||
</code></pre>
|
||||
<p>in <code>$HOME/.profile</code>), to install in <code>$HOME/{include,lib,bin}</code>:</p>
|
||||
<pre><code> ./configure --prefix=$HOME && make -j8 install
|
||||
</code></pre>
|
||||
<p>For distributors creating a package: Run</p>
|
||||
<pre><code> ./configure --prefix=/usr && make -j8
|
||||
</code></pre>
|
||||
<p>and then follow your usual packaging procedures for the
|
||||
<code>build/0/package</code> files:</p>
|
||||
<pre><code> build/0/package/man/man3/cpucycles.3
|
||||
build/0/package/include/cpucycles.h
|
||||
build/0/package/lib/libcpucycles*
|
||||
build/0/package/bin/cpucycles-info
|
||||
</code></pre>
|
||||
<p>There are some old systems where libcpucycles requires <code>-lrt</code> for
|
||||
<code>clock_gettime</code>; currently <code>libcpucycles.so</code> doesn't link to <code>-lrt</code>,
|
||||
so it's up to the caller to link to <code>-lrt</code>.</p>
|
||||
<p>More options: You can run</p>
|
||||
<pre><code> ./configure --host=amd64
|
||||
</code></pre>
|
||||
<p>to override <code>./configure</code>'s guess of the architecture that it should
|
||||
compile for. The architecture controls which cycle counters to try
|
||||
compiling: e.g., <code>amd64</code> tries compiling <code>cpucycles/amd64*</code> and
|
||||
<code>cpucycles/default*</code>.</p>
|
||||
<p>Inside the <code>build</code> directory, <code>0</code> is symlinked to <code>amd64</code> for
|
||||
<code>--host=amd64</code>. Running <code>make clean</code> removes <code>build/amd64</code>. Re-running
|
||||
<code>./configure</code> automatically starts with <code>make clean</code>.</p>
|
||||
<p>A subsequent <code>./configure --host=arm64</code> will create <code>build/arm64</code> and
|
||||
symlink <code>0 -> arm64</code>, without touching an existing <code>build/amd64</code>.
|
||||
However, cross-compilers aren't yet selected automatically.</p>
|
||||
<p>Compilers tried are listed in <code>compilers/default</code>. Each compiler
|
||||
includes <code>-fPIC</code> to create a shared library, <code>-fvisibility=hidden</code> to
|
||||
hide non-public symbols in the library, and <code>-fwrapv</code> to switch to a
|
||||
slightly less dangerous version of C. The first compiler that seems to
|
||||
work is used to compile everything.</p><hr><font size=1><b>Version:</b>
|
||||
This is version 2023.01.05 of the "Install" web page.
|
||||
</font>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,122 @@
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<style type="text/css">
|
||||
html{overflow-y:scroll}
|
||||
body{font-family:sans-serif}
|
||||
p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
|
||||
li p{font-size:1.0em}
|
||||
blockquote p{font-size:1.0em}
|
||||
tt{font-size:1.2em}
|
||||
code{font-size:1.2em}
|
||||
h1{font-size:1.5em}
|
||||
h2{font-size:1.3em}
|
||||
h3{font-size:1.0em}
|
||||
h1 a{text-decoration:none}
|
||||
table{border-collapse:collapse}
|
||||
th,td{border:1px solid black}
|
||||
table a{text-decoration:none}
|
||||
table tr{font-size:0.9em;line-height:1.6em}
|
||||
.links a:hover{text-decoration:underline}
|
||||
.links a:active{text-decoration:underline}
|
||||
.links img{width:200px;padding-left:1em}
|
||||
.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
|
||||
.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
|
||||
.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
|
||||
min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
|
||||
font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
|
||||
.here{border-bottom:0px;background-color:#ffffff}
|
||||
.away{background-color:#125d0d;}
|
||||
.away a{text-decoration:none;display:block;color:#ffffff}
|
||||
.away a:hover,.away a:active{text-decoration:underline}
|
||||
.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
|
||||
</style>
|
||||
<title>
|
||||
Security</title>
|
||||
</head>
|
||||
<body>
|
||||
<div class=headline>
|
||||
libcpucycles
|
||||
</div>
|
||||
<div class=nav>
|
||||
<div class="navt away"><a href=index.html>Intro</a>
|
||||
</div><div class="navt away"><a href=download.html>Download</a>
|
||||
</div><div class="navt away"><a href=install.html>Install</a>
|
||||
</div><div class="navt away"><a href=api.html>API</a>
|
||||
</div><div class="navt away"><a href=counters.html>Counters</a>
|
||||
</div><div class="navt away"><a href=selection.html>Selection</a>
|
||||
</div><div class="navt here">Security
|
||||
</div></div>
|
||||
<div class=main>
|
||||
<p>Many security systems have been shown to be breakable by "timing
|
||||
attacks". These attacks extract secrets by analyzing timings of the
|
||||
legitimate user's operations on secret data. See the June 2022 survey
|
||||
page <a href="https://timing.attacks.cr.yp.to">https://timing.attacks.cr.yp.to</a>
|
||||
for an overview and further references.</p>
|
||||
<p>Sometimes these attacks are used as motivation to disable the attacker's
|
||||
access to various timing mechanisms. For example, Firefox rounds its
|
||||
<code>performance.now</code> timer to 1-millisecond resolution
|
||||
<a href="https://developer.mozilla.org/en-US/docs/Web/API/Performance/now">"to mitigate potential security threats"</a>.</p>
|
||||
<p>As another example, reducing <code>/proc/sys/kernel/perf_event_paranoid</code>
|
||||
under Linux to 2 (from 3 or higher), so that libcpucycles has access to
|
||||
the best available Intel/AMD cycle counter (RDPMC), also means making
|
||||
this cycle counter and other performance-monitoring counters available
|
||||
to any attacker-controlled software running on the computer. Perhaps
|
||||
this helps timing attacks, not to mention the possibility of opening up
|
||||
other vulnerabilities via the complicated <code>perf_event</code> interface.</p>
|
||||
<p>As yet another example, ARM CPUs disable user access to the main CPU
|
||||
cycle counter by default. Installing a kernel module to enable user
|
||||
access to the cycle counter could help attacks.</p>
|
||||
<p>Given the availability of simple mechanisms to disable RDPMC etc., it is
|
||||
easy to recommend using those mechanisms. To avoid creating unnecessary
|
||||
tension between those recommendations and the use of libcpucycles,
|
||||
applications that use libcpucycles should be structured so that
|
||||
high-resolution timers are used only on controlled development and
|
||||
benchmarking machines, not on general end-user machines.</p>
|
||||
<p>This structure might seem incompatible with using cycle counts to
|
||||
automatically select the best of multiple options, as in FFTW. However,
|
||||
new infrastructure introduced in <a href="https://lib25519.cr.yp.to">lib25519</a>
|
||||
automatically selects options on end-user machines based on cycle counts
|
||||
that were <em>collected on benchmarking machines</em>.</p>
|
||||
<p>The above text should not be understood as endorsing the idea that
|
||||
disabling timers is an <em>effective</em> defense against timing attacks.
|
||||
Certainly disabling high-resolution timers is not sufficient for
|
||||
security: there are many ways for attackers to amplify timing signals
|
||||
and to statistically filter out noise from low-resolution timers.
|
||||
Disabling <em>every</em> standard timing mechanism on the machine does not stop
|
||||
the attacker from accessing a remote timer or a counter maintained by
|
||||
the attacker's software. Perhaps disabling timers sometimes makes the
|
||||
difference between a feasible attack and an infeasible attack, but
|
||||
evaluating this is extremely difficult.</p>
|
||||
<p>Meanwhile there is an auditable methodology available to stop timing
|
||||
attacks: constant-time programming, which systematically cuts off data
|
||||
flow from secrets to timings.</p>
|
||||
<p>For example, secrets affect a CPU's power consumption, and Turbo Boost
|
||||
creates data flow from power consumption to timings, as illustrated by
|
||||
the <a href="https://www.hertzbleed.com">Hertzbleed attack</a> extracting secret
|
||||
keys from the SIKE cryptosystem (before SIKE was broken in other ways),
|
||||
and an <a href="https://arxiv.org/abs/2206.07012">independent attack</a>
|
||||
extracting secret AES keys. Consequently, the constant-time methodology
|
||||
does not allow Turbo Boost.</p>
|
||||
<p>This is why <a href="https://timing.attacks.cr.yp.to">https://timing.attacks.cr.yp.to</a>
|
||||
recommends turning off Turbo Boost "right now", and explains the
|
||||
mechanisms available to do this. One non-security reason that it was
|
||||
already normal (although not universal) for manufacturers to provide
|
||||
these mechanisms to end users is that Turbo Boost has a reputation for
|
||||
causing premature hardware failures. Turbo Boost also provides very
|
||||
little speed benefit for modern multithreaded vectorized applications.</p>
|
||||
<p>Another reaction to timing attacks is to apply "masking" techniques.
|
||||
These techniques <em>seem</em> to make it more difficult for attackers to
|
||||
extract secrets from power consumption and other side channels. However,
|
||||
as <a href="https://timing.attacks.cr.yp.to">https://timing.attacks.cr.yp.to</a>
|
||||
explains, it is "practically impossible for an auditor to obtain any
|
||||
real assurance that these techniques are secure". See the December 2022
|
||||
paper
|
||||
<a href="https://eprint.iacr.org/2022/1713">"Breaking a fifth-order masked implementation of CRYSTALS-Kyber by copy-paste"</a>
|
||||
for a newer example of a security failure in a masked implementation.</p><hr><font size=1><b>Version:</b>
|
||||
This is version 2023.01.05 of the "Security" web page.
|
||||
</font>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,158 @@
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="content-type" content="text/html; charset=utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<style type="text/css">
|
||||
html{overflow-y:scroll}
|
||||
body{font-family:sans-serif}
|
||||
p,ul,ol,blockquote,pre{font-size:0.9em;line-height:1.6em}
|
||||
li p{font-size:1.0em}
|
||||
blockquote p{font-size:1.0em}
|
||||
tt{font-size:1.2em}
|
||||
code{font-size:1.2em}
|
||||
h1{font-size:1.5em}
|
||||
h2{font-size:1.3em}
|
||||
h3{font-size:1.0em}
|
||||
h1 a{text-decoration:none}
|
||||
table{border-collapse:collapse}
|
||||
th,td{border:1px solid black}
|
||||
table a{text-decoration:none}
|
||||
table tr{font-size:0.9em;line-height:1.6em}
|
||||
.links a:hover{text-decoration:underline}
|
||||
.links a:active{text-decoration:underline}
|
||||
.links img{width:200px;padding-left:1em}
|
||||
.links td{border:0px;padding-top:0.5em;padding-bottom:0.5em}
|
||||
.headline{padding:0;font-weight:bold;font-size:1.5em;vertical-align:top;padding-bottom:0.5em;color:#125d0d}
|
||||
.navt{display:inline-block;box-sizing:border-box;-moz-box-sizing:border-box;-webkit-box-sizing:border-box;
|
||||
min-width:14%;margin:0;padding:0;padding-left:0.5em;padding-right:0.5em;vertical-align:center;
|
||||
font-weight:bold;font-size:1.1em;text-align:center;border:1px solid black}
|
||||
.here{border-bottom:0px;background-color:#ffffff}
|
||||
.away{background-color:#125d0d;}
|
||||
.away a{text-decoration:none;display:block;color:#ffffff}
|
||||
.away a:hover,.away a:active{text-decoration:underline}
|
||||
.main{margin:0;padding-top:0em;padding-bottom:1%;clear:both}
|
||||
</style>
|
||||
<title>
|
||||
Selection</title>
|
||||
</head>
|
||||
<body>
|
||||
<div class=headline>
|
||||
libcpucycles
|
||||
</div>
|
||||
<div class=nav>
|
||||
<div class="navt away"><a href=index.html>Intro</a>
|
||||
</div><div class="navt away"><a href=download.html>Download</a>
|
||||
</div><div class="navt away"><a href=install.html>Install</a>
|
||||
</div><div class="navt away"><a href=api.html>API</a>
|
||||
</div><div class="navt away"><a href=counters.html>Counters</a>
|
||||
</div><div class="navt here">Selection
|
||||
</div><div class="navt away"><a href=security.html>Security</a>
|
||||
</div></div>
|
||||
<div class=main>
|
||||
<p>Here is how libcpucycles decides which cycle counter to use. The
|
||||
underlying principles are as follows:</p>
|
||||
<ul>
|
||||
<li>
|
||||
<p>Failure is not allowed. Using a low-resolution timer such as
|
||||
<code>gettimeofday()</code> to estimate cycle counts is not desirable but is better
|
||||
than providing no information.</p>
|
||||
</li>
|
||||
<li>
|
||||
<p>A counter that does well on some CPUs and OSes can do badly on others.
|
||||
The counter selection in libcpucycles is based not just on rules set
|
||||
at compile time but also on measurements of how well the counters
|
||||
perform when the program first calls <code>cpucycles()</code>.</p>
|
||||
</li>
|
||||
<li>
|
||||
<p>A critical application of cycle counting is collecting cycle counts
|
||||
for multiple options to see which option is faster. It is the caller's
|
||||
responsibility to compute medians of cycle counts for many runs of
|
||||
whatever is being benchmarked: medians filter out occasional
|
||||
cycle-count jumps caused by migration to another core (if the
|
||||
benchmark is not pinned to a single core) or interrupts from other OS
|
||||
activity. libcpucycles does not reject an otherwise attractive counter
|
||||
merely because of occasional jumps.</p>
|
||||
</li>
|
||||
<li>
|
||||
<p>Cycle-counting overhead is not desirable, but does not directly affect
|
||||
comparisons of multiple options measured using the same cycle counter,
|
||||
so it is less important than consistent major errors such as treating
|
||||
2^32 + x cycles as x cycles. (Performance experts seeing a function
|
||||
that takes billions of cycles usually focus on smaller subroutines,
|
||||
but libcpucycles should not break larger measurements.) This is why
|
||||
libcpucycles does not provide direct access to 32-bit cycle counters:
|
||||
it provides wrappers that combine the counters with <code>gettimeofday()</code> to
|
||||
produce 64 bits, even though this incurs some extra overhead.</p>
|
||||
</li>
|
||||
<li>
|
||||
<p>The noise introduced by typical off-core clocks, such as multiplying a
|
||||
24MHz clock by 86 to estimate cycles on a 2.064GHz CPU core, comes in
|
||||
small part from low resolution but much more from changes in CPU
|
||||
frequency: e.g., a 10000-cycle computation might be measured as 20000
|
||||
cycles when the CPU enters a power-saving mode. When libcpucycles has
|
||||
access to what is believed to be an on-core cycle counter, it uses
|
||||
that even when its measurements show some noise. (Choosing an on-core
|
||||
cycle counter does not magically eliminate the change in the relative
|
||||
speed of the CPU and DRAM; the usual advice to warm up the CPU and set
|
||||
constant frequencies if possible still applies.)</p>
|
||||
</li>
|
||||
</ul>
|
||||
<p>When <code>cpucycles()</code> is first called, libcpucycles tries running each
|
||||
cycle counter that has been compiled into the library. For example, for
|
||||
64-bit ARM CPUs, libcpucycles will try <code>arm64-pmc</code>, <code>arm64-vct</code>,
|
||||
<code>default-gettimeofday</code>, <code>default-mach</code>, <code>default-monotonic</code>, and
|
||||
<code>default-perfevent</code>, minus any of those that failed to compile.</p>
|
||||
<p>Cycle counters that fail at run time with SIGILL (or SIGFPE or SIGBUS or
|
||||
SIGSEGV) are eliminated from the list. For example, <code>arm64-pmc</code> will
|
||||
fail with SIGILL if the kernel does not allow user access to
|
||||
<code>PMCCNTR_EL0</code>. Beware that libcpucycles does not catch SIGILL after its
|
||||
initial tests: if the kernel initially allows user access to
|
||||
<code>PMCCNTR_EL0</code> but later turns it off, then <code>arm64-pmc</code> will crash.</p>
|
||||
<p>Independently of these counters, libcpucycles uses various OS mechanisms
|
||||
to obtain an <em>estimate</em> of the CPU frequency. This estimate is also
|
||||
available to the caller as <code>cpucycles_persecond()</code>.</p>
|
||||
<p>The methods that libcpucycles uses to ask the OS for an estimated CPU
|
||||
frequency fail on some OS-CPU combinations, in which case libcpucycles
|
||||
falls back to a <code>cpucyclespersecond</code> environment variable, or, if that
|
||||
variable does not exist, an estimate of 2399987654 cycles per second.
|
||||
(This estimate is in a realistic range of CPU speeds, and is close to
|
||||
multiples of 24MHz, 25MHz, and 19.2MHz, which are common crystal
|
||||
frequencies.) The sysadmin can create <code>/etc/cpucyclespersecond</code> to
|
||||
override all of the OS mechanisms.</p>
|
||||
<p>For counters that do not ask for scaling, the estimated CPU frequency is
|
||||
shown in <code>cpucycles-info</code> as a double-check on the counter results. For
|
||||
counters that ask for scaling, libcpucycles uses the estimated CPU
|
||||
frequency to compute the scaling, so this is not a double-check. If a
|
||||
counter asks for scaling and the estimated CPU frequency does not seem
|
||||
close to a multiple of the counter frequency (possibly with a small
|
||||
power-of-2 denominator) then libcpucycles will throw the counter away,
|
||||
except in the case of fixed-resolution OS counters such as
|
||||
<code>gettimeofday</code> and <code>CLOCK_MONOTONIC</code>.</p>
|
||||
<p>libcpucycles computes a precision estimate for each counter (times any
|
||||
applicable scaling) as follows. Call the counter 1000 times. Check that
|
||||
the counter has never decreased, and has increased at least once. (A
|
||||
counter where the decrease/increase checks fail is retried 10 times, so
|
||||
10000 calls overall, and removed if it fails all 10 times.) The
|
||||
precision estimate is then the smallest nonzero difference between
|
||||
adjacent counter results, plus a penalty explained below.</p>
|
||||
<p>The penalty is 100 cycles for off-core counters (including RDTSC) and
|
||||
<code>default-perfevent</code>, and 200 cycles for fixed-resolution OS counters.
|
||||
For example, an on-core CPU cycle counter will be selected even if it
|
||||
actually has, e.g., a resolution of 8 cycles and 50 cycles of overhead.</p>
|
||||
<p>Finally, libcpucycles selects the counter where the precision estimate
|
||||
is the smallest number of cycles. Note that an inaccurate estimate of
|
||||
CPU frequency can influence the choice between a scaled counter and an
|
||||
unscaled counter.</p>
|
||||
<p>libcpucycles does <em>not</em> carry out its counter selection (typically tens
|
||||
of milliseconds, sometimes even more) as a static initializer; callers
|
||||
are presumed to not want to incur the cost of initialization unless and
|
||||
until they are actually using <code>cpucycles()</code>. A multithreaded caller thus
|
||||
has to place locks around any possibly-first call to <code>cpucycles()</code>, or
|
||||
create its own static initializer (an <code>__attribute__((constructor))</code>
|
||||
function) with an initial <code>cpucycles()</code> call so that all subsequent
|
||||
<code>cpucycles()</code> calls are thread-safe.</p><hr><font size=1><b>Version:</b>
|
||||
This is version 2023.01.05 of the "Selection" web page.
|
||||
</font>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,56 @@
|
||||
Prerequisites: `python3`; `gcc` and/or `clang`. Currently tested only
|
||||
under Linux, but porting to other systems shouldn't be difficult.
|
||||
|
||||
For sysadmins, to install in `/usr/local/{include,lib,bin}`:
|
||||
|
||||
     ./configure && make -j8 install
|
||||
|
||||
For developers with an unprivileged account (typically with
|
||||
|
||||
export LD_LIBRARY_PATH="$HOME/lib"
|
||||
export LIBRARY_PATH="$HOME/lib"
|
||||
export CPATH="$HOME/include"
|
||||
export PATH="$HOME/bin:$PATH"
|
||||
|
||||
in `$HOME/.profile`), to install in `$HOME/{include,lib,bin}`:
|
||||
|
||||
./configure --prefix=$HOME && make -j8 install
|
||||
|
||||
For distributors creating a package: Run
|
||||
|
||||
./configure --prefix=/usr && make -j8
|
||||
|
||||
and then follow your usual packaging procedures for the
|
||||
`build/0/package` files:
|
||||
|
||||
build/0/package/man/man3/cpucycles.3
|
||||
build/0/package/include/cpucycles.h
|
||||
build/0/package/lib/libcpucycles*
|
||||
build/0/package/bin/cpucycles-info
|
||||
|
||||
There are some old systems where libcpucycles requires `-lrt` for
|
||||
`clock_gettime`; currently `libcpucycles.so` doesn't link to `-lrt`,
|
||||
so it's up to the caller to link to `-lrt`.
|
||||
|
||||
More options: You can run
|
||||
|
||||
./configure --host=amd64
|
||||
|
||||
to override `./configure`'s guess of the architecture that it should
|
||||
compile for. The architecture controls which cycle counters to try
|
||||
compiling: e.g., `amd64` tries compiling `cpucycles/amd64*` and
|
||||
`cpucycles/default*`.
|
||||
|
||||
Inside the `build` directory, `0` is symlinked to `amd64` for
|
||||
`--host=amd64`. Running `make clean` removes `build/amd64`. Re-running
|
||||
`./configure` automatically starts with `make clean`.
|
||||
|
||||
A subsequent `./configure --host=arm64` will create `build/arm64` and
|
||||
symlink `0 -> arm64`, without touching an existing `build/amd64`.
|
||||
However, cross-compilers aren't yet selected automatically.
|
||||
|
||||
Compilers tried are listed in `compilers/default`. Each compiler
|
||||
includes `-fPIC` to create a shared library, `-fvisibility=hidden` to
|
||||
hide non-public symbols in the library, and `-fwrapv` to switch to a
|
||||
slightly less dangerous version of C. The first compiler that seems to
|
||||
work is used to compile everything.
|
||||
@@ -0,0 +1,57 @@
|
||||
.\" Automatically generated by Pandoc 2.9.2.1
|
||||
.\"
|
||||
.TH "cpucycles" "3" "" "" ""
|
||||
.hy
|
||||
.SS NAME
|
||||
.PP
|
||||
cpucycles - count CPU cycles
|
||||
.SS SYNOPSIS
|
||||
.IP
|
||||
.nf
|
||||
\f[C]
|
||||
#include <cpucycles.h>
|
||||
|
||||
long long count = cpucycles();
|
||||
long long persecond = cpucycles_persecond();
|
||||
const char *implementation = cpucycles_implementation();
|
||||
const char *version = cpucycles_version();
|
||||
\f[R]
|
||||
.fi
|
||||
.PP
|
||||
Link with \f[C]-lcpucycles\f[R].
|
||||
Old systems may also need \f[C]-lrt\f[R].
|
||||
.SS DESCRIPTION
|
||||
.PP
|
||||
\f[C]cpucycles()\f[R] returns an estimate for the number of CPU cycles
|
||||
that have occurred since an unspecified time in the past (perhaps system
|
||||
boot, perhaps program startup).
|
||||
.PP
|
||||
Accessing true cycle counters can be difficult on some CPUs and
|
||||
operating systems.
|
||||
\f[C]cpucycles()\f[R] does its best to produce accurate results, but
|
||||
selects a low-precision counter if the only other option is failure.
|
||||
.PP
|
||||
\f[C]cpucycles_persecond()\f[R] returns an estimate for the number of
|
||||
CPU cycles per second.
|
||||
This estimate comes from \f[C]/etc/cpucyclespersecond\f[R] if that file
|
||||
exists, otherwise from various OS mechanisms, otherwise from the
|
||||
\f[C]cpucyclespersecond\f[R] environment variable if that is set,
|
||||
otherwise 2399987654.
|
||||
.PP
|
||||
\f[C]cpucycles_implementation()\f[R] returns the name of the counter in
|
||||
use: e.g., \f[C]\[dq]amd64-pmc\[dq]\f[R].
|
||||
.PP
|
||||
\f[C]cpucycles_version()\f[R] returns the \f[C]libcpucycles\f[R] version
|
||||
number as a string: e.g., \f[C]\[dq]20230115\[dq]\f[R].
|
||||
Results of \f[C]cpucycles_implementation()\f[R] should be interpreted
|
||||
relative to \f[C]cpucycles_version()\f[R].
|
||||
.PP
|
||||
\f[C]cpucycles\f[R] is actually a function pointer.
|
||||
The first call to \f[C]cpucycles()\f[R] or
|
||||
\f[C]cpucycles_persecond()\f[R] or \f[C]cpucycles_implementation()\f[R]
|
||||
selects one of the available counters and updates the
|
||||
\f[C]cpucycles\f[R] pointer accordingly.
|
||||
Subsequent calls to \f[C]cpucycles()\f[R] are thread-safe.
|
||||
.SS SEE ALSO
|
||||
.PP
|
||||
\f[B]gettimeofday\f[R](2), \f[B]clock_gettime\f[R](2)
|
||||
@@ -0,0 +1,36 @@
|
||||
libcpucycles is a public-domain microlibrary for counting CPU cycles.
|
||||
Cycle counts are not as detailed as
|
||||
[Falk diagrams](https://gamozolabs.github.io/metrology/2019/08/19/sushi_roll.html)
|
||||
but are the most precise timers available to typical software; they are
|
||||
central tools used in understanding and improving software performance.
|
||||
|
||||
The libcpucycles [API](api.html) is simple: include `<cpucycles.h>`, call
|
||||
`cpucycles()` to receive a `long long` whenever desired, and link with
|
||||
`-lcpucycles`.
|
||||
|
||||
[Internally](counters.html), libcpucycles understands machine-level
|
||||
cycle counters for amd64 (both PMC and TSC), arm32, arm64 (both PMC and
|
||||
VCT), mips64, ppc32, ppc64, riscv32, riscv64, s390x, sparc64, and x86.
|
||||
libcpucycles also understands four OS-level mechanisms, which give
|
||||
varying levels of accuracy: `mach_absolute_time`, `perf_event`,
|
||||
`CLOCK_MONOTONIC`, and, as a fallback, microsecond-resolution
|
||||
`gettimeofday`.
|
||||
|
||||
When the program first calls `cpucycles()`, libcpucycles automatically
|
||||
benchmarks the available mechanisms and [selects](selection.html) the
|
||||
mechanism that does the best job. Subsequent `cpucycles()` calls are
|
||||
thread-safe and very fast. An accompanying `cpucycles-info` program
|
||||
prints a summary of cycle-counter accuracy.
|
||||
|
||||
For comparison, there is a simple-sounding `__rdtsc()` API provided by
|
||||
compilers, but this works only on Intel/AMD CPUs and is generally noisier
|
||||
than PMC. There is a `__builtin_readcyclecounter()` that works on more
|
||||
CPUs, but this works only with `clang` and has the same noise problems.
|
||||
Both of these mechanisms put the burden on the caller to figure out what
|
||||
can be done on other CPUs. Various packages include their own more
|
||||
portable abstraction layers for counting cycles (see, e.g., FFTW's
|
||||
[`cycle.h`](https://github.com/FFTW/fftw3/blob/master/kernel/cycle.h),
|
||||
used to automatically select from among multiple implementations
|
||||
provided by FFTW), but this creates per-package effort to keep up with
|
||||
the latest cycle counters. The goal of libcpucycles is to provide
|
||||
state-of-the-art cycle counting centrally for all packages to use.
|
||||
@@ -0,0 +1,76 @@
|
||||
Many security systems have been shown to be breakable by "timing
|
||||
attacks". These attacks extract secrets by analyzing timings of the
|
||||
legitimate user's operations on secret data. See the June 2022 survey
|
||||
page [https://timing.attacks.cr.yp.to](https://timing.attacks.cr.yp.to)
|
||||
for an overview and further references.
|
||||
|
||||
Sometimes these attacks are used as motivation to disable the attacker's
|
||||
access to various timing mechanisms. For example, Firefox rounds its
|
||||
`performance.now` timer to 1-millisecond resolution
|
||||
["to mitigate potential security threats"](https://developer.mozilla.org/en-US/docs/Web/API/Performance/now).
|
||||
|
||||
As another example, reducing `/proc/sys/kernel/perf_event_paranoid`
|
||||
under Linux to 2 (from 3 or higher), so that libcpucycles has access to
|
||||
the best available Intel/AMD cycle counter (RDPMC), also means making
|
||||
this cycle counter and other performance-monitoring counters available
|
||||
to any attacker-controlled software running on the computer. Perhaps
|
||||
this helps timing attacks, not to mention the possibility of opening up
|
||||
other vulnerabilities via the complicated `perf_event` interface.
|
||||
|
||||
As yet another example, ARM CPUs disable user access to the main CPU
|
||||
cycle counter by default. Installing a kernel module to enable user
|
||||
access to the cycle counter could help attacks.
|
||||
|
||||
Given the availability of simple mechanisms to disable RDPMC etc., it is
|
||||
easy to recommend using those mechanisms. To avoid creating unnecessary
|
||||
tension between those recommendations and the use of libcpucycles,
|
||||
applications that use libcpucycles should be structured so that
|
||||
high-resolution timers are used only on controlled development and
|
||||
benchmarking machines, not on general end-user machines.
|
||||
|
||||
This structure might seem incompatible with using cycle counts to
|
||||
automatically select the best of multiple options, as in FFTW. However,
|
||||
new infrastructure introduced in [lib25519](https://lib25519.cr.yp.to)
|
||||
automatically selects options on end-user machines based on cycle counts
|
||||
that were _collected on benchmarking machines_.
|
||||
|
||||
The above text should not be understood as endorsing the idea that
|
||||
disabling timers is an _effective_ defense against timing attacks.
|
||||
Certainly disabling high-resolution timers is not sufficient for
|
||||
security: there are many ways for attackers to amplify timing signals
|
||||
and to statistically filter out noise from low-resolution timers.
|
||||
Disabling _every_ standard timing mechanism on the machine does not stop
|
||||
the attacker from accessing a remote timer or a counter maintained by
|
||||
the attacker's software. Perhaps disabling timers sometimes makes the
|
||||
difference between a feasible attack and an infeasible attack, but
|
||||
evaluating this is extremely difficult.
|
||||
|
||||
Meanwhile there is an auditable methodology available to stop timing
|
||||
attacks: constant-time programming, which systematically cuts off data
|
||||
flow from secrets to timings.
|
||||
|
||||
For example, secrets affect a CPU's power consumption, and Turbo Boost
|
||||
creates data flow from power consumption to timings, as illustrated by
|
||||
the [Hertzbleed attack](https://www.hertzbleed.com) extracting secret
|
||||
keys from the SIKE cryptosystem (before SIKE was broken in other ways),
|
||||
and an [independent attack](https://arxiv.org/abs/2206.07012)
|
||||
extracting secret AES keys. Consequently, the constant-time methodology
|
||||
does not allow Turbo Boost.
|
||||
|
||||
This is why [https://timing.attacks.cr.yp.to](https://timing.attacks.cr.yp.to)
|
||||
recommends turning off Turbo Boost "right now", and explains the
|
||||
mechanisms available to do this. One non-security reason that it was
|
||||
already normal (although not universal) for manufacturers to provide
|
||||
these mechanisms to end users is that Turbo Boost has a reputation for
|
||||
causing premature hardware failures. Turbo Boost also provides very
|
||||
little speed benefit for modern multithreaded vectorized applications.
|
||||
|
||||
Another reaction to timing attacks is to apply "masking" techniques.
|
||||
These techniques _seem_ to make it more difficult for attackers to
|
||||
extract secrets from power consumption and other side channels. However,
|
||||
as [https://timing.attacks.cr.yp.to](https://timing.attacks.cr.yp.to)
|
||||
explains, it is "practically impossible for an auditor to obtain any
|
||||
real assurance that these techniques are secure". See the December 2022
|
||||
paper
|
||||
["Breaking a fifth-order masked implementation of CRYSTALS-Kyber by copy-paste"](https://eprint.iacr.org/2022/1713)
|
||||
for a newer example of a security failure in a masked implementation.
|
||||
@@ -0,0 +1,104 @@
|
||||
Here is how libcpucycles decides which cycle counter to use. The
|
||||
underlying principles are as follows:
|
||||
|
||||
* Failure is not allowed. Using a low-resolution timer such as
|
||||
`gettimeofday()` to estimate cycle counts is not desirable but is better
|
||||
than providing no information.
|
||||
|
||||
* A counter that does well on some CPUs and OSes can do badly on others.
|
||||
The counter selection in libcpucycles is based not just on rules set
|
||||
at compile time but also on measurements of how well the counters
|
||||
perform when the program first calls `cpucycles()`.
|
||||
|
||||
* A critical application of cycle counting is collecting cycle counts
|
||||
for multiple options to see which option is faster. It is the caller's
|
||||
responsibility to compute medians of cycle counts for many runs of
|
||||
whatever is being benchmarked: medians filter out occasional
|
||||
cycle-count jumps caused by migration to another core (if the
|
||||
benchmark is not pinned to a single core) or interrupts from other OS
|
||||
activity. libcpucycles does not reject an otherwise attractive counter
|
||||
merely because of occasional jumps.
|
||||
|
||||
* Cycle-counting overhead is not desirable, but does not directly affect
|
||||
comparisons of multiple options measured using the same cycle counter,
|
||||
so it is less important than consistent major errors such as treating
|
||||
2^32 + x cycles as x cycles. (Performance experts seeing a function
|
||||
that takes billions of cycles usually focus on smaller subroutines,
|
||||
but libcpucycles should not break larger measurements.) This is why
|
||||
libcpucycles does not provide direct access to 32-bit cycle counters:
|
||||
it provides wrappers that combine the counters with `gettimeofday()` to
|
||||
produce 64 bits, even though this incurs some extra overhead.
|
||||
|
||||
* The noise introduced by typical off-core clocks, such as multiplying a
|
||||
24MHz clock by 86 to estimate cycles on a 2.064GHz CPU core, comes in
|
||||
small part from low resolution but much more from changes in CPU
|
||||
frequency: e.g., a 10000-cycle computation might be measured as 20000
|
||||
cycles when the CPU enters a power-saving mode. When libcpucycles has
|
||||
access to what is believed to be an on-core cycle counter, it uses
|
||||
that even when its measurements show some noise. (Choosing an on-core
|
||||
cycle counter does not magically eliminate the change in the relative
|
||||
speed of the CPU and DRAM; the usual advice to warm up the CPU and set
|
||||
constant frequencies if possible still applies.)
|
||||
|
||||
When `cpucycles()` is first called, libcpucycles tries running each
|
||||
cycle counter that has been compiled into the library. For example, for
|
||||
64-bit ARM CPUs, libcpucycles will try `arm64-pmc`, `arm64-vct`,
|
||||
`default-gettimeofday`, `default-mach`, `default-monotonic`, and
|
||||
`default-perfevent`, minus any of those that failed to compile.
|
||||
|
||||
Cycle counters that fail at run time with SIGILL (or SIGFPE or SIGBUS or
|
||||
SIGSEGV) are eliminated from the list. For example, `arm64-pmc` will
|
||||
fail with SIGILL if the kernel does not allow user access to
|
||||
`PMCCNTR_EL0`. Beware that libcpucycles does not catch SIGILL after its
|
||||
initial tests: if the kernel initially allows user access to
|
||||
`PMCCNTR_EL0` but later turns it off then `arm64-pmc` will crash.
|
||||
|
||||
Independently of these counters, libcpucycles uses various OS mechanisms
|
||||
to obtain an _estimate_ of the CPU frequency. This estimate is also
|
||||
available to the caller as `cpucycles_persecond()`.
|
||||
|
||||
The methods that libcpucycles uses to ask the OS for an estimated CPU
|
||||
frequency fail on some OS-CPU combinations, in which case libcpucycles
|
||||
falls back to a `cpucyclespersecond` environment variable, or, if that
|
||||
variable does not exist, an estimate of 2399987654 cycles per second.
|
||||
(This estimate is in a realistic range of CPU speeds, and is close to
|
||||
multiples of 24MHz, 25MHz, and 19.2MHz, which are common crystal
|
||||
frequencies.) The sysadmin can create `/etc/cpucyclespersecond` to
|
||||
override all of the OS mechanisms.
|
||||
|
||||
For counters that do not ask for scaling, the estimated CPU frequency is
|
||||
shown in `cpucycles-info` as a double-check on the counter results. For
|
||||
counters that ask for scaling, libcpucycles uses the estimated CPU
|
||||
frequency to compute the scaling, so this is not a double-check. If a
|
||||
counter asks for scaling and the estimated CPU frequency does not seem
|
||||
close to a multiple of the counter frequency (possibly with a small
|
||||
power-of-2 denominator) then libcpucycles will throw the counter away,
|
||||
except in the case of fixed-resolution OS counters such as
|
||||
`gettimeofday` and `CLOCK_MONOTONIC`.
|
||||
|
||||
libcpucycles computes a precision estimate for each counter (times any
|
||||
applicable scaling) as follows. Call the counter 1000 times. Check that
|
||||
the counter has never decreased, and has increased at least once. (A
|
||||
counter where the decrease/increase checks fail is retried 10 times, so
|
||||
10000 calls overall, and removed if it fails all 10 times.) The
|
||||
precision estimate is then the smallest nonzero difference between
|
||||
adjacent counter results, plus a penalty explained below.
|
||||
|
||||
The penalty is 100 cycles for off-core counters (including RDTSC) and
|
||||
`default-perfevent`, and 200 cycles for fixed-resolution OS counters.
|
||||
For example, an on-core CPU cycle counter will be selected even if it
|
||||
actually has, e.g., a resolution of 8 cycles and 50 cycles of overhead.
|
||||
|
||||
Finally, libcpucycles selects the counter where the precision estimate
|
||||
is the smallest number of cycles. Note that an inaccurate estimate of
|
||||
CPU frequency can influence the choice between a scaled counter and an
|
||||
unscaled counter.
|
||||
|
||||
libcpucycles does _not_ carry out its counter selection (typically tens
|
||||
of milliseconds, sometimes even more) as a static initializer; callers
|
||||
are presumed to not want to incur the cost of initialization unless and
|
||||
until they are actually using `cpucycles()`. A multithreaded caller thus
|
||||
has to place locks around any possibly-first call to `cpucycles()`, or
|
||||
create its own static initializer (an `__attribute__((constructor))`
|
||||
function) with an initial `cpucycles()` call so that all subsequent
|
||||
`cpucycles()` calls are thread-safe.
|
||||
+27
@@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import tempfile
|
||||
|
||||
# Installation prefix, taken from the first command-line argument.
prefix = sys.argv[1]
# Sub-directories of `package/` that are installed, relative to the prefix.
dirs = ('man/man3', 'lib', 'include', 'bin')
# Maps each sub-directory name to its destination directory under the prefix.
install = {}

# Use umask 022 while the destination directories are created.
os.umask(0o22)

for target in dirs:
    destination = prefix + '/' + target
    install[target] = destination
    os.makedirs(destination, exist_ok=True)

# Switch to umask 077 for the temporary staging directories and copies.
os.umask(0o77)

for target in dirs:
    package_dir = 'package/' + target
    destination = install[target]
    # Each file is first copied into a temporary directory created inside the
    # destination directory and is then renamed into its final place.
    with tempfile.TemporaryDirectory(dir=destination) as staging:
        for fn in sorted(os.listdir(package_dir)):
            packaged = package_dir + '/' + fn
            staged = staging + '/' + fn
            try:
                shutil.copy2(packaged, staged, follow_symlinks=False)
            except TypeError:  # XXX: old python3; should copy symlinks manually
                shutil.copy2(packaged, staged)
            os.rename(staged, destination + '/' + fn)
|
||||
+6
@@ -0,0 +1,6 @@
|
||||
#!/bin/sh
|
||||
|
||||
# Rebuild the static archive from the object files given as arguments.
archive=package/lib/libcpucycles.a

# Start from scratch so that no members of a previous archive survive.
rm -f "$archive"
ar cr "$archive" "$@"
# A failing ranlib is deliberately ignored.
ranlib "$archive" || :
chmod 644 "$archive"
|
||||
@@ -0,0 +1 @@
|
||||
20230115
|
||||
@@ -0,0 +1,9 @@
|
||||
// Raw declarations of the symbols provided by the statically linked
// `cpucycles` library. Everything here is `unsafe` to use.
#[link(name = "cpucycles", kind = "static")]
extern "C" {
    /// Global function pointer through which the cycle counter is read.
    ///
    /// Declared nullable (`Option`) on the Rust side, so callers must handle
    /// `None` before calling through it.
    pub static mut cpucycles: Option<unsafe extern "C" fn() -> std::os::raw::c_longlong>;

    /// Returns a C string naming the counter implementation in use.
    pub fn cpucycles_implementation() -> *const std::os::raw::c_char;

    /// Returns a C string holding the library version.
    pub fn cpucycles_version() -> *const std::os::raw::c_char;

    /// Returns the library's estimate of CPU cycles per second.
    pub fn cpucycles_persecond() -> std::os::raw::c_longlong;

    /// NOTE(review): semantics are not visible from this declaration;
    /// presumably enables tracing of the library's setup - confirm against
    /// the C sources.
    pub fn cpucycles_tracesetup();
}
|
||||
@@ -0,0 +1,82 @@
|
||||
#![allow(non_upper_case_globals)]
|
||||
#![allow(non_camel_case_types)]
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
mod bindings;
|
||||
use bindings as c;
|
||||
|
||||
use std::fmt;
|
||||
use std::{
|
||||
error::Error,
|
||||
ffi::{CStr, CString, IntoStringError},
|
||||
};
|
||||
|
||||
/// Error type produced by the safe wrappers in this crate.
#[derive(Debug)]
pub struct CpuCyclesError {
    /// Human-readable description; this is exactly what `Display` emits.
    message: String,
}

impl fmt::Display for CpuCyclesError {
    /// Writes the stored message verbatim, without applying any width,
    /// fill or precision requested by the caller's format specifier.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(&self.message)
    }
}

// No underlying cause is exposed, so the default `Error` methods suffice.
impl Error for CpuCyclesError {}
|
||||
|
||||
pub fn cpucycles_tracesetup() {
|
||||
unsafe { c::cpucycles_tracesetup() }
|
||||
}
|
||||
|
||||
pub fn cpucycles() -> Result<i64, CpuCyclesError> {
|
||||
if let Some(count) = unsafe { c::cpucycles.map(|f| f()) } {
|
||||
Ok(count)
|
||||
} else {
|
||||
Err(CpuCyclesError {
|
||||
message: "Could not execute cpucycles!".to_string(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub fn cpucycles_persecond() -> Result<i64, CpuCyclesError> {
|
||||
Ok(unsafe { c::cpucycles_persecond() })
|
||||
}
|
||||
|
||||
pub fn cpucycles_implementation() -> Result<String, IntoStringError> {
|
||||
let implementation = unsafe { CString::from(CStr::from_ptr(c::cpucycles_implementation())) };
|
||||
implementation.into_string()
|
||||
}
|
||||
|
||||
pub fn cpucycles_version() -> Result<String, IntoStringError> {
|
||||
let version = unsafe { CString::from(CStr::from_ptr(c::cpucycles_version())) };
|
||||
version.into_string()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use crate::*;

    // Each test only checks that the corresponding wrapper reports success;
    // the returned values themselves are machine-dependent.

    #[test]
    fn cpucycles_test() {
        assert!(cpucycles().is_ok())
    }

    #[test]
    fn cpucycles_persecond_test() {
        assert!(cpucycles_persecond().is_ok());
    }

    #[test]
    fn cpucycles_implementation_test() {
        assert!(cpucycles_implementation().is_ok());
    }

    #[test]
    fn cpucycles_version_test() {
        assert!(cpucycles_version().is_ok());
    }
}
|
||||
+18
-13
@@ -29,32 +29,37 @@ log = { workspace = true }
|
||||
pretty_env_logger = "0.4.0"
|
||||
rand = "0.7.3"
|
||||
rocket = { version = "0.5.0-rc.2", features = ["json"] }
|
||||
serde = { version="1.0", features = ["derive"] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
sysinfo = "0.27.7"
|
||||
tokio = { version="1.21.2", features = ["rt-multi-thread", "net", "signal"] }
|
||||
tokio-util = { version="0.7.3", features = ["codec"] }
|
||||
tokio = { version = "1.21.2", features = ["rt-multi-thread", "net", "signal"] }
|
||||
tokio-util = { version = "0.7.3", features = ["codec"] }
|
||||
toml = "0.5.8"
|
||||
url = { version = "2.2", features = ["serde"] }
|
||||
atty = "0.2"
|
||||
|
||||
## internal
|
||||
nym-config = { path="../common/config" }
|
||||
nym-crypto = { path="../common/crypto" }
|
||||
nym-config = { path = "../common/config" }
|
||||
nym-crypto = { path = "../common/crypto" }
|
||||
nym-contracts-common = { path = "../common/cosmwasm-smart-contracts/contracts-common" }
|
||||
mixnet-client = { path="../common/client-libs/mixnet-client" }
|
||||
mixnode-common = { path="../common/mixnode-common" }
|
||||
nym-nonexhaustive-delayqueue = { path="../common/nonexhaustive-delayqueue" }
|
||||
nym-sphinx = { path="../common/nymsphinx" }
|
||||
mixnet-client = { path = "../common/client-libs/mixnet-client" }
|
||||
mixnode-common = { path = "../common/mixnode-common" }
|
||||
nym-nonexhaustive-delayqueue = { path = "../common/nonexhaustive-delayqueue" }
|
||||
nym-sphinx = { path = "../common/nymsphinx" }
|
||||
nym-pemstore = { path = "../common/pemstore", version = "0.2.0" }
|
||||
nym-task = { path = "../common/task" }
|
||||
nym-types = { path = "../common/types" }
|
||||
nym-topology = { path="../common/topology" }
|
||||
validator-client = { path="../common/client-libs/validator-client" }
|
||||
nym-bin-common = { path="../common/bin-common" }
|
||||
nym-topology = { path = "../common/topology" }
|
||||
validator-client = { path = "../common/client-libs/validator-client" }
|
||||
nym-bin-common = { path = "../common/bin-common" }
|
||||
|
||||
[dev-dependencies]
|
||||
tokio = { version="1.21.2", features = ["rt-multi-thread", "net", "signal", "test-util"] }
|
||||
tokio = { version = "1.21.2", features = [
|
||||
"rt-multi-thread",
|
||||
"net",
|
||||
"signal",
|
||||
"test-util",
|
||||
] }
|
||||
|
||||
nym-sphinx-types = { path = "../common/nymsphinx/types" }
|
||||
nym-sphinx-params = { path = "../common/nymsphinx/params" }
|
||||
|
||||
Reference in New Issue
Block a user