Compare commits

...

1 Commits

Author SHA1 Message Date
durch e8d8bfc55f DJB lib25519 Rust wrapper 2023-06-20 17:19:09 +02:00
1695 changed files with 337040 additions and 1 deletions
+1 -1
View File
@@ -102,7 +102,7 @@ default-members = [
"explorer-api",
]
exclude = ["explorer", "contracts", "clients/webassembly", "nym-wallet", "nym-connect/mobile/src-tauri", "nym-connect/desktop", "cpu-cycles"]
exclude = ["explorer", "contracts", "clients/webassembly", "nym-wallet", "nym-connect/mobile/src-tauri", "nym-connect/desktop", "cpu-cycles", "lib-25519"]
[workspace.package]
authors = ["Nym Technologies SA"]
+23
View File
@@ -0,0 +1,23 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "lib_25519"
version = "0.1.0"
dependencies = [
"cfg-if",
"libc",
]
[[package]]
name = "libc"
version = "0.2.146"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b"
+12
View File
@@ -0,0 +1,12 @@
[package]
name = "lib_25519"
version = "0.1.0"
edition = "2021"
build = "build.rs"
links = "25519"
[dependencies]
libc = "0.2.140"
[build-dependencies]
cfg-if = "1"
+69
View File
@@ -0,0 +1,69 @@
use std::{env, path::PathBuf, process::Command};
fn main() {
let out_dir = env::var("OUT_DIR").unwrap();
let out_path = PathBuf::from(&out_dir);
let source_path = PathBuf::from("lib25519")
.canonicalize()
.expect("cannot canonicalize path");
cfg_if::cfg_if! {
if #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "mips", target_arch = "powerpc", target_arch = "powerpc64", target_arch = "arm")))] {
panic!("Unsupported architecture - {}!", env::var("CARGO_CFG_TARGET_ARCH").unwrap(), )
}
};
let mut compile_o_command = Command::new("./configure");
let compile_o_command = compile_o_command
.current_dir(&source_path)
.arg(format!("--prefix={out_dir}"));
match compile_o_command.output() {
Ok(output) => {
if !output.status.success() {
panic!("{:?}", unsafe {
std::str::from_utf8_unchecked(&output.stderr)
})
}
}
Err(e) => panic!("{e}"),
}
let mut compile_o_command = Command::new("make");
let compile_o_command = compile_o_command
.current_dir(&source_path)
.arg("-j8")
.arg("install");
match compile_o_command.output() {
Ok(output) => {
if !output.status.success() {
panic!("{:?}", unsafe {
std::str::from_utf8_unchecked(&output.stderr)
})
}
}
Err(e) => panic!("{e}"),
}
println!(
"cargo:rustc-link-search=native={}",
out_path.join("lib").to_str().unwrap()
);
println!("cargo:rustc-link-lib=static=25519");
println!("cargo:rustc-link-lib=static=randombytes_kernel");
let mut compile_o_command = Command::new("make");
let compile_o_command = compile_o_command.current_dir(source_path).arg("clean");
match compile_o_command.output() {
Ok(output) => {
if !output.status.success() {
panic!("{:?}", unsafe {
std::str::from_utf8_unchecked(&output.stderr)
})
}
}
Err(e) => panic!("{e}"),
}
}
+9
View File
@@ -0,0 +1,9 @@
Files: *
Copyright: 2022 Kaushik Nath
Copyright: 2011-2022 Daniel J. Bernstein
Copyright: 2015 Tung Chou
Copyright: 2011-2014 Peter Schwabe
Copyright: 2011 Niels Duif
Copyright: 2011 Tanja Lange
Copyright: 2011 Bo-Yin Yang
License: CC0-1.0
+297
View File
@@ -0,0 +1,297 @@
lib25519 draws on many previous implementations listed below, plus new
speedups from Kaushik Nath and new infrastructure work and factoring
from Daniel J. Bernstein. All software is in the public domain. Since
some organizations require licenses, lib25519 is also CC0-licensed.
Some code was originally copied from public-domain code in the SUPERCOP
benchmarking framework. See https://bench.cr.yp.to/supercop.html for
SUPERCOP releases. The following small changes from code available in
SUPERCOP are made in lib25519 without further comment:
* Returning void rather than int for functions that never fail in
lib25519.
* Message lengths long long rather than unsigned long long.
* Defining various constants in .c files (to simplify PIC handling)
instead of .S files.
* Moving some C files to shared-*.c (which in lib25519 means that
these files are compiled by only one compiler).
* Using CRYPTO_SHARED_NAMESPACE rather than CRYPTO_NAMESPACE for
symbols defined in *.S and shared-*.c.
* Adding various "linker define" and "linker use" lines.
Larger changes from code in SUPERCOP, such as new code divisions across
lib25519 primitives, are indicated below.
Sources of Curve25519 software (this is not a comprehensive list, just
the software that lib25519 is derived from):
* Daniel J. Bernstein. "Curve25519: new Diffie-Hellman speed
records." Pages 207228 in Public key cryptography—PKC 2006, 9th
international conference on theory and practice in public-key
cryptography, New York, NY, USA, April 2426, 2006, proceedings,
edited by Moti Yung, Yevgeniy Dodis, Aggelos Kiayias, Tal Malkin,
Lecture Notes in Computer Science 3958, Springer, 2006, ISBN
3-540-33851-9.
This is the source of the Curve25519 design, the X25519 design, and
various speedups. Most of the software from that paper is specific
to a variety of 32-bit platforms (radix 2^25.5 or radix 2^21.25),
but the portable supercop/crypto_scalarmult/curve25519/ref10 (radix
2^25.5) is derived from this.
lib25519/crypto_nP/montgomery25519/ref10 starts with
supercop/crypto_scalarmult/curve25519/ref10, and tweaks the API to
provide crypto_nP instead of crypto_scalarmult. Inversion is
factored out, producing crypto_pow/inv25519/ref10. The trivial
crypto_scalarmult_base wrapper is factored out, producing
crypto_nG/montgomery25519/ref/base.c; lib25519 has faster nG
functions, but intentionally provides ref for situations where
speed is outweighed by simplicity, assurance, code size, etc.
* supercop/crypto_scalarmult/curve25519/donna_c64 (radix 2^51) from
Adam Langley.
lib25519/crypto_nP/montgomery25519/donna_c64 starts from this and
tweaks the API to provide crypto_nP instead of crypto_scalarmult
(and removes crypto_scalarmult_base). crypto_pow/inv25519/donna_c64
is factored out.
* Daniel J. Bernstein, Niels Duif, Tanja Lange, Peter Schwabe, Bo-Yin
Yang. "High-speed high-security signatures." Pages 124142 in
Cryptographic hardware and embedded systems—CHES 2011, 13th
international workshop, Nara, Japan, September 28October 1, 2011,
proceedings, edited by Bart Preneel, Tsuyoshi Takagi, Lecture Notes
in Computer Science 6917, Springer, 2011, ISBN 978-3-642-23950-2.
Journal version: Journal of Cryptographic Engineering 2 (2012),
7789.
This is the source of the Ed25519 design and various X25519/Ed25519
speedups for 64-bit Intel/AMD platforms, in particular producing
supercop/crypto_{scalarmult/curve,sign/ed}25519/amd64-{51,64}*
(radix 2^51 and radix 2^64 respectively). Peter Schwabe led the
implementation work.
lib25519/crypto_nP/montgomery25519/amd64-51 starts from
supercop/crypto_scalarmult/curve25519/amd64-51 and tweaks the API
to provide crypto_nP instead of crypto_scalarmult (and removes
crypto_scalarmult_base). crypto_nG/merged25519/amd64-51 (for
fixed-base-point multiplication), crypto_mGnP/ed25519/amd64-51 (for
double-scalar multiplication), and crypto_sign/ed25519/amd64 (for
the remaining signing components) factor
supercop/crypto_sign/ed25519/amd64-51 into smaller pieces.
crypto_pow/inv25519/amd64-51 is also factored out. "SMALLTABLES"
support is removed. Support for batch verification is removed,
although it could reappear in a subsequent lib25519 release.
Similar comments apply to amd64-64 and ref10. A compiler warning
is eliminated (window size 64 in amd64-64-24k/sc25519.h).
* Tung Chou. "Sandy2x: New Curve25519 Speed Records." SAC 2015.
This is the source of various X25519 speedups using 256-bit vector
instructions, specifically AVX vector instructions in Intel's Sandy
Bridge, in particular producing
supercop/crypto_scalarmult/curve25519/sandy2x (radix 2^25.5).
lib25519/crypto_{nP,nG}/montgomery25519/sandy2x start from
supercop/crypto_scalarmult/curve25519/sandy2x, and tweak the API to
provide crypto_nP and crypto_nG instead of crypto_scalarmult and
crypto_scalarmult_base respectively. The top bit of the incoming
point is cleared. crypto_pow/inv25519/sandy2x is factored out.
* Kaushik Nath and Palash Sarkar, "Efficient arithmetic in
(pseudo-)Mersenne prime order fields", Advances in Mathematics of
Communications 16 (2022), pages 303348.
Original release:
https://github.com/kn-cs/pmp-farith/tree/master/gf-p2-255-19/SL
https://github.com/kn-cs/pmp-farith/tree/master/gf-p2-255-19/USL1
The "SL" software is the source of various speedups to the amd64-64
software, producing the "maa4" versions of fe25519_mul.S,
fe25519_square.S, and fe25519_nsquare.S. These .S files are used
inside the following lib25519 directories:
crypto_mGnP/ed25519/amd64-avx2-10l-maa4
crypto_mGnP/ed25519/amd64-avx2-9l-maa4
crypto_mGnP/ed25519/amd64-maa4
crypto_nG/merged25519/amd64-avx2-10l-maa4
crypto_nG/merged25519/amd64-avx2-9l-maa4
crypto_nG/merged25519/amd64-maa4
crypto_nP/montgomery25519/amd64-avx2-hey10l-maa4
crypto_nP/montgomery25519/amd64-avx2-hey9l-maa4
crypto_nP/montgomery25519/amd64-avx2-ns10l-maa4
crypto_nP/montgomery25519/amd64-avx2-ns9l-maa4
crypto_nP/montgomery25519/amd64-maa4
crypto_pow/inv25519/amd64-maa4
The "USL" software is the source of various speedups to the
amd64-51 software, producing the "maa5" versions of fe25519_mul.S
and fe25519_nsquare.S. These .S files are used inside the following
lib25519 directories:
crypto_nP/montgomery25519/amd64-avx2-hey10l-maa5
crypto_nP/montgomery25519/amd64-avx2-hey9l-maa5
crypto_nP/montgomery25519/amd64-avx2-ns10l-maa5
crypto_nP/montgomery25519/amd64-avx2-ns9l-maa5
crypto_pow/inv25519/amd64-maa5
* Kaushik Nath and Palash Sarkar, "Security and efficiency trade-offs
for elliptic curve Diffie-Hellman at the 128-bit and 224-bit
security levels." J. Cryptogr. Eng. 12(1): 107-121 (2022).
Original release:
https://github.com/kn-cs/x25519/tree/master/intel64-mxaa-4limb
https://github.com/kn-cs/x25519
This "mxaa-4limb" software is the source of various speedups to
"maa4" on CPUs supporting BMI2 instructions (e.g., Intel Haswell
from 2013), in particular producing the "mxaa" versions of
fe25519_mul.S and fe25519_nsquare.S. These .S files are used inside
the following lib25519 directories:
crypto_mGnP/ed25519/amd64-avx2-10l-mxaa
crypto_mGnP/ed25519/amd64-avx2-9l-mxaa
crypto_mGnP/ed25519/amd64-mxaa
crypto_nG/merged25519/amd64-avx2-10l-mxaa
crypto_nG/merged25519/amd64-avx2-9l-mxaa
crypto_nG/merged25519/amd64-mxaa
crypto_nP/montgomery25519/amd64-avx2-hey10l-mxaa
crypto_nP/montgomery25519/amd64-avx2-hey9l-mxaa
crypto_nP/montgomery25519/amd64-avx2-ns10l-mxaa
crypto_nP/montgomery25519/amd64-avx2-ns9l-mxaa
crypto_nP/montgomery25519/amd64-mxaa
crypto_pow/inv25519/amd64-mxaa
This software is also the source of the following three different
Montgomery-ladder functions, where the third also builds on the
"maax" work listed below:
crypto_nP/montgomery25519/amd64-maa4/mladder.S
crypto_nP/montgomery25519/amd64-mxaa/mladder.S
crypto_nP/montgomery25519/amd64-maax/mladder.S
* Kaushik Nath and Palash Sarkar, "Efficient arithmetic in
(pseudo-)Mersenne prime order fields", Advances in Mathematics of
Communications 16 (2022), pages 303348. Original release:
https://github.com/kn-cs/pmp-farith/tree/master/gf-p2-255-19/SLDCC
This is the source of various speedups to "mxaa" on CPUs that also
support ADX instructions (e.g., Intel Broadwell from 2014), in
particular producing the "maax" versions of fe25519_mul.S,
fe25519_square.S, and fe25519_nsquare.S. These .S files are used
inside the following lib25519 directories:
crypto_mGnP/ed25519/amd64-avx2-10l-maax
crypto_mGnP/ed25519/amd64-avx2-9l-maax
crypto_mGnP/ed25519/amd64-avx512ifma-5l-maax
crypto_mGnP/ed25519/amd64-maax
crypto_nG/merged25519/amd64-avx2-10l-maax
crypto_nG/merged25519/amd64-avx2-9l-maax
crypto_nG/merged25519/amd64-avx512ifma-5l-maax
crypto_nG/merged25519/amd64-maax
crypto_nP/montgomery25519/amd64-avx2-hey10l-maax
crypto_nP/montgomery25519/amd64-avx2-hey9l-maax
crypto_nP/montgomery25519/amd64-avx2-ns10l-maax
crypto_nP/montgomery25519/amd64-avx2-ns9l-maax
crypto_nP/montgomery25519/amd64-avx512-hey10l-maax
crypto_nP/montgomery25519/amd64-avx512-hey9l-maax
crypto_nP/montgomery25519/amd64-avx512-ns10l-maax
crypto_nP/montgomery25519/amd64-avx512-ns9l-maax
crypto_nP/montgomery25519/amd64-avx512ifma-hey5l-maax
crypto_nP/montgomery25519/amd64-avx512ifma-ns5l-maax
crypto_nP/montgomery25519/amd64-maax
crypto_pow/inv25519/amd64-maax
* Kaushik Nath and Palash Sarkar, "Efficient 4-Way Vectorizations of
the Montgomery Ladder". IEEE Trans. Computers 71(3): 712-723
(2022). Original release:
https://github.com/kn-cs/vec-ladder/tree/master/Curve25519
This is the source of the "hey10l" (radix 2^25.5), "hey9l" (radix
2^29), "ns10l" (radix 2^25.5), and "ns9l" (radix 2^29) versions of
mladder.S for CPUs that also support 256-bit AVX2 instructions
(e.g., Intel Haswell from 2013). In lib25519, these four .S files
are used in 16 directories:
crypto_nP/montgomery25519/amd64-avx2-hey10l-{maa4,maa5,maax,mxaa}
crypto_nP/montgomery25519/amd64-avx2-hey9l-{maa4,maa5,maax,mxaa}
crypto_nP/montgomery25519/amd64-avx2-ns10l-{maa4,maa5,maax,mxaa}
crypto_nP/montgomery25519/amd64-avx2-ns9l-{maa4,maa5,maax,mxaa}
* Kaushik Nath, new Montgomery-ladder code new in lib25519 (no paper
yet) for CPUs supporting AVX-512 instructions (e.g., Intel
Skylake-X from 2017). These are six files in lib25519:
crypto_nP/montgomery25519/amd64-avx512-hey10l-maax
crypto_nP/montgomery25519/amd64-avx512-hey9l-maax
crypto_nP/montgomery25519/amd64-avx512-ns10l-maax
crypto_nP/montgomery25519/amd64-avx512-ns9l-maax
crypto_nP/montgomery25519/amd64-avx512ifma-hey5l-maax
crypto_nP/montgomery25519/amd64-avx512ifma-ns5l-maax
* Kaushik Nath, nine versions of fixed-base-point
scalar-multiplication code new in lib25519 (no paper yet) for
various platforms:
crypto_nG/merged25519/amd64-avx2-10l-maa4/ge25519_base.S
crypto_nG/merged25519/amd64-avx2-10l-maax/ge25519_base.S
crypto_nG/merged25519/amd64-avx2-10l-mxaa/ge25519_base.S
crypto_nG/merged25519/amd64-avx2-9l-maa4/ge25519_base.S
crypto_nG/merged25519/amd64-avx2-9l-maax/ge25519_base.S
crypto_nG/merged25519/amd64-avx2-9l-mxaa/ge25519_base.S
crypto_nG/merged25519/amd64-avx512ifma-5l-maax/ge25519_base.S
crypto_nG/merged25519/amd64-maa4/ge25519_base.S
crypto_nG/merged25519/amd64-maax/ge25519_base.S
crypto_nG/merged25519/amd64-mxaa/ge25519_base.S
* Kaushik Nath, ten versions of double-scalar-multiplication code new
in lib25519 (no paper yet) for various platforms. Each version has
precompute.S and process.S:
crypto_mGnP/ed25519/amd64-avx2-10l-maa4/ge25519_double_scalarmult_precompute.S
crypto_mGnP/ed25519/amd64-avx2-10l-maax/ge25519_double_scalarmult_precompute.S
crypto_mGnP/ed25519/amd64-avx2-10l-mxaa/ge25519_double_scalarmult_precompute.S
crypto_mGnP/ed25519/amd64-avx2-9l-maa4/ge25519_double_scalarmult_precompute.S
crypto_mGnP/ed25519/amd64-avx2-9l-maax/ge25519_double_scalarmult_precompute.S
crypto_mGnP/ed25519/amd64-avx2-9l-mxaa/ge25519_double_scalarmult_precompute.S
crypto_mGnP/ed25519/amd64-avx512ifma-5l-maax/ge25519_double_scalarmult_precompute.S
crypto_mGnP/ed25519/amd64-maa4/ge25519_double_scalarmult_precompute.S
crypto_mGnP/ed25519/amd64-maax/ge25519_double_scalarmult_precompute.S
crypto_mGnP/ed25519/amd64-mxaa/ge25519_double_scalarmult_precompute.S
crypto_mGnP/ed25519/amd64-avx2-10l-maa4/ge25519_double_scalarmult_process.S
crypto_mGnP/ed25519/amd64-avx2-10l-maax/ge25519_double_scalarmult_process.S
crypto_mGnP/ed25519/amd64-avx2-10l-mxaa/ge25519_double_scalarmult_process.S
crypto_mGnP/ed25519/amd64-avx2-9l-maa4/ge25519_double_scalarmult_process.S
crypto_mGnP/ed25519/amd64-avx2-9l-maax/ge25519_double_scalarmult_process.S
crypto_mGnP/ed25519/amd64-avx2-9l-mxaa/ge25519_double_scalarmult_process.S
crypto_mGnP/ed25519/amd64-avx512ifma-5l-maax/ge25519_double_scalarmult_process.S
crypto_mGnP/ed25519/amd64-maa4/ge25519_double_scalarmult_process.S
crypto_mGnP/ed25519/amd64-maax/ge25519_double_scalarmult_process.S
crypto_mGnP/ed25519/amd64-mxaa/ge25519_double_scalarmult_process.S
Almost all of the crypto_pow/inv25519 implementations use exponentiation,
but there is also a different implementation from the following source:
* Daniel J. Bernstein, Bo-Yin Yang. "Fast constant-time gcd
computation and modular inversion." IACR Transactions on
Cryptographic Hardware and Embedded Systems 2019 issue 3 (2019),
340398.
This is the source of the "safegcd" algorithm and software. Further
speedups (no paper yet; ideas from Peter Dettman, Gregory Maxwell,
and Pieter Wuille) have produced the "inverse25519skylake" software
available here: https://gcd.cr.yp.to/software.html
lib25519/crypto_pow/inv25519/amd64-safegcd is inverse25519skylake,
tweaked to provide the crypto_pow API and to clear the top bit of
the input.
For lower-layer SHA-512 functions:
* Daniel J. Bernstein, supercop/crypto_hash*/sha512/*. In lib25519,
some unused variables are removed in crypto_hashblocks/sha512/avx
to eliminate compiler warnings.
Most of the lib25519 infrastructure, such as the run-time implementation
selector automatically guided by CPU type and previous benchmarks, is
new in lib25519 from Daniel J. Bernstein. Portions of autogen-speed
(generating lib25519-speed.c) and autogen-test (generating
lib25519-test.c) are based on benchmarking software and test software in
SUPERCOP by Daniel J. Bernstein. The symmetric-cryptography code for
generating pseudorandom test inputs and hashing test outputs is adapted
from TweetNaCl, a library by Daniel J. Bernstein, Wesley Janssen, Tanja
Lange, and Peter Schwabe.
+53
View File
@@ -0,0 +1,53 @@
Prerequisites: python3; gcc and/or clang.
For sysadmins, to install in /usr/local/{include,lib,bin}:
./configure && make -j8 install
For users, to install in $HOME/{include,lib,bin}:
./configure --prefix=$HOME && make -j8 install
For distributors creating a package: Run
./configure --prefix=/usr && make -j8
and then follow your usual packaging procedures for the .h files in
build/0/package/include, the libraries in build/0/package/lib, and the
test programs in build/0/package/bin.
The long-term plan is to split off some components of lib25519 into
their own packages, and distributors may already wish to package these
components accordingly:
* -lcpucycles will be its own package.
* -lrandombytes_kernel will be its own package.
* -lrandombytes will be an indirection layer providing randombytes.h
and supporting an ecosystem of randombytes() implementations (via,
e.g., Debian's /etc/alternatives), such as -lrandombytes_kernel,
-lrandombytes_per_process_rng, etc.
* -l25519 will be the main lib25519 package, using -lrandombytes and
-lcpucycles.
lib25519-test already uses -l25519 without -lrandombytes: it substitutes
its own knownrandombytes() to generate test vectors.
More options: You can run
./configure --host=amd64
to override ./configure's guess of the architecture that it should
compile for. The architecture controls which implementations to try
(see crypto_*/*/*/architectures) and which compilers to try (see
compilers/*).
Inside the build directory, 0 is symlinked to amd64 for --host=amd64.
Running "make clean" removes build/amd64. Re-running ./configure
automatically starts with "make clean".
A subsequent ./configure --host=arm will create build/arm and symlink
0 -> arm, without touching an existing build/amd64. However,
cross-compilers aren't yet selected automatically.
+405
View File
@@ -0,0 +1,405 @@
This file explains the internal structure of lib25519, and explains how
to add new instruction sets and new implementations.
## Primitives
The directories `crypto_*/*` inside lib25519 define the following
primitives (see also `autogen-test` for Python versions of the
mathematical primitives):
* `crypto_verify/32`: `crypto_verify_32(s,t)` returns 0 when the 32-byte
arrays `s` and `t` are equal, otherwise `-1`. This function takes
constant time.
* `crypto_hashblocks/sha512`: `crypto_hashblocks_sha512(h,x,xlen)`
updates an intermediate SHA-512 hash `h` using all of the full
128-byte blocks at the beginning of the `xlen`-byte array `x`, and
returns the number of bytes left over, namely `xlen` mod 128. This
function takes time that depends on `xlen` but not on the contents of
`h` or `x`.
* `crypto_hash/sha512`: `crypto_hash_sha512(h,x,xlen)` computes the
SHA-512 hash `h` of the `xlen`-byte array `x`. This function takes
time that depends on `xlen` but not on the contents of `x`.
* `crypto_pow/inv25519`: `crypto_pow_inv25519(y,x)` computes the
2^255^21 power `y` of an integer `x` modulo 2^255^19. This is the
same as the inverse of `x` modulo 2^255^19 if `x` is not divisible by
2^255^19. The integers `x` and `y` are represented as a 32-byte array
in little-endian form. This function takes constant time.
This function guarantees that the output `y` is frozen modulo
2^255^19, i.e., completely reduced to the range 0,1,...,2^255^20. The
caller is expected to freeze `x` before calling this function. The
function accepts `x` in the range {0,1,...,2^256^1} while ignoring the
top bit (the coefficient of 2^255^ in binary): i.e., the function
reduces `x` modulo 2^255^ and then modulo 2^255^19.
* `crypto_nP/montgomery25519`: `crypto_nP_montgomery25519(nP,n,P)`
computes the X25519 function: in short, if a Curve25519 point has
x-coordinate `P` then the `n`th multiple of the point has x-coordinate
`nP`. The inputs and outputs are represented as 32-byte arrays in
little-endian form. This function takes constant time.
X25519 is defined for `n` in the range 2^254^ + 8{0,1,2,3,...,2^251^1}.
`crypto_nP_montgomery25519` allows `n` in the wider range
{0,1,...,2^256^1}, and in all cases computes `m`th multiples where `m`
is defined as follows: make a copy of `n`, clear the top bit, set the
next bit, and clear the bottom three bits.
X25519 guarantees that the output `nP` is frozen. It does not require
the input to be frozen; also, it allows the input to be on the twist,
and to have small order.
`crypto_nP_montgomery25519` clears the top bit of `P` before applying
the X25519 function. Callers that want the X25519 function on `P` with
the top bit set have to reduce modulo 2^255^19 for themselves.
* `crypto_nG/merged25519`: `crypto_nG_merged25519(nG,n)` reads an
integer `n` in the range {0,1,...,2^256^1} and outputs a frozen
integer `nG` modulo 2^255^19, possibly with the top bit set (i.e.,
adding 2^255^) as described below. Both `n` and `nG` are represented
as 32-byte arrays in little-endian form. This function takes constant
time.
If the top bit of `n` is clear then `nG` is the Edwards y-coordinate
of the `n`th multiple of G, and the top bit is set exactly when the
Edwards x-coordinate is odd. Otherwise `nG` is the Montgomery
x-coordinate of the (`n`2^255^)th multiple of G, and the top bit is
clear. Here G is the standard Curve25519 base point, which has
Montgomery x-coordinate 9, Edwards y-coordinate 4/5, and even Edwards
x-coordinate.
* `crypto_nG/montgomery25519`: `crypto_nG_montgomery25519(nG,n)` is
the same as `crypto_nP_montgomery(nG,n,G)` where `G` is the array
{9,0,0,...,0}. This function takes constant time.
The point of `crypto_nG` is to save time (using a small table
precomputed from `G`) compared to the more general `crypto_nP`. This
has the disadvantage of being more complicated, which is particularly
important given that lib25519 has not yet been verified, and in any
case increases code size noticeably for X25519. There is a `ref`
implementation of `crypto_nG` that simply calls `crypto_nP`, and
setting sticky bits on the other implementation directories
(`chmod +t crypto_nG/montgomery25519/*; chmod -t crypto_nG/montgomery25519/ref`)
will force lib25519 to use `ref`.
* `crypto_mGnP/ed25519`: `crypto_mGnP_ed25519(mGnP,m,n,P)` computes
`(m mod L)G(n mod L)P` in Edwards coordinates, where `L` is the prime
number 2^252^+27742317777372353535851937790883648493 and `G` is the
same standard base point. This function takes time that depends on the
inputs.
The input `m` is an integer in the range {0,1,...,2^256^1}
represented as a 32-byte array in little-endian form. Any `m` outside
the range {0,1,...,L1} triggers a failure, which is reported as
described below.
The input `n` is an integer in the range {0,1,...,2^512^1}
represented as a 64-byte array in little-endian form.
The input point `P` is represented as a 32-byte array as follows: the
(frozen) Edwards y-coordinate of `P` in {0,1,...,2^255^20} is stored
in little-endian form, and then the top bit is set exactly when the
(frozen) Edwards x-coordinate of `P` is odd. An input 32-byte array
that does not have this form is instead interpreted as the point `P`
with Edwards coordinates (...8,26), and triggers a failure, reported
as described below.
The output is a 33-byte array. The first 32 bytes are the output point
`(m mod L)G(n mod L)P`, represented the same way as `P`. The last
byte is 1 on success and 0 on failure.
* `crypto_dh/x25519`: `crypto_dh_x25519_keypair(pk,sk)` generates a
32-byte X25519 public key `pk` and the corresponding 32-byte secret
key `sk`. This function is the composition of `randombytes` to
generate `sk` and `crypto_nG_montgomery25519` to generate `pk`.
`crypto_dh_x25519(k,pk,sk)` generates a 32-byte shared secret `k`
given a public key `pk` and a secret key `sk`. This function is the
same as `crypto_nP_montgomery25519`.
* `crypto_sign/ed25519`: `crypto_sign_ed25519_keypair(pk,sk)` generates
a 32-byte Ed25519 public key `pk` and the corresponding 64-byte secret
key `sk`. This function takes constant time.
`crypto_sign_ed25519(sm,&smlen,m,mlen,sk)` generates an `smlen`-byte
signed message `sm` given an `mlen`-byte message `m` and a secret key
`sk`. The caller is required to allocate `mlen+64` bytes for `sm`. The
function always sets `smlen` to `mlen+64`. This function takes time
that depends on `mlen` but not on the other inputs.
`crypto_sign_ed25519_open(m,&mlen,sm,smlen,pk)` generates an
`mlen`-byte message `m` given an `smlen`-byte signed message `sm` and
a public key `pk`, and returns 0. However, if `sm` is invalid, this
function returns `-1`, sets `mlen` to `-1`, and clears `m`. The caller is
required to allocate `smlen` (not just `smlen-64`) bytes for `m`, for
example using the same array for `sm` and `m`. This function takes time
that depends on its inputs.
lib25519 includes a command-line utility `lib25519-test` that runs some
tests for each of these primitives, and another utility `lib25519-speed`
that measures cycle counts for each of these primitives.
The stable lib25519 API functions are built from the above primitives:
* `lib25519_dh_keypair` is `crypto_dh_x25519_keypair`.
* `lib25519_dh` is `crypto_dh_x25519`.
* `lib25519_sign_keypair` is `crypto_sign_ed25519_keypair`.
* `lib25519_sign` is `crypto_sign_ed25519`.
* `lib25519_sign_open` is `crypto_sign_ed25519_open`.
Some changes are anticipated in the list of primitives, but these API
functions will remain stable.
As in SUPERCOP and NaCl, message lengths intentionally use `long long`,
not `size_t`. In lib25519, message lengths are signed.
## Implementations
A single primitive can, and usually does, have multiple implementations.
Each implementation is in its own subdirectory. The implementations are
required to have exactly the same input-output behavior, and to some
extent this is tested, although it is not yet formally verified.
Different implementations typically offer different tradeoffs between
portability, simplicity, and efficiency. For example,
`crypto_nP/montgomery25519/ref10` is portable;
`crypto_nP/montgomery25519/amd64-maax` is faster and less portable.
Each unportable implementation has an `architectures` file. Each line in
this file identifies a CPU instruction set (and ABI) where the
implementation works. For example,
`crypto_nP/montgomery25519/amd64-maax/architectures` has one line
`amd64 bmi2 adx`, meaning that the implementation works on CPUs that
have the Intel/AMD 64-bit instruction set with the BMI2 and ADX
instruction-set extensions. The top-level `compilers` directory shows
(among other things) the allowed instruction-set names such as `bmi2`.
At run time, lib25519 checks the CPU where it is running, and selects
an implementation where `architectures` is compatible with that CPU.
Each primitive makes its own selection once per program startup, using
the compiler's `ifunc` mechanism. This type of run-time selection means,
for example, that an `amd64` CPU without AVX2 can share binaries with an
`amd64` CPU with AVX2. However, correctness requires instruction sets to
be preserved by migration across cores via the OS kernel, VM migration,
etc.
The compiler has a `target` mechanism that makes an `ifunc` selection
based on CPU architectures. Instead of using the `target` mechanism,
lib25519 uses a more sophisticated mechanism that also accounts for
benchmarks collected in advance of compilation.
## Compilers
lib25519 tries different C compilers for each implementation. For
example, `compilers/default` lists the following compilers:
gcc -Wall -fPIC -fwrapv -O2
clang -Wall -fPIC -fwrapv -Qunused-arguments -O2
Sometimes `gcc` produces better code, and sometimes `clang` produces
better code.
As another example, `compilers/amd64+avx` lists the following compilers:
gcc -Wall -fPIC -fwrapv -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mtune=sandybridge
clang -Wall -fPIC -fwrapv -Qunused-arguments -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mtune=sandybridge
The `-mavx` option tells these compilers that they are free to use the
AVX instruction-set extension.
Code compiled using the compilers in `compilers/amd64+avx` will be
considered at run time by the lib25519 selection mechanism if the
`supports()` function in `compilers/amd64+avx.c` returns nonzero. This
function checks whether the run-time CPU supports AVX (and SSE and so on,
and OSXSAVE with XMM/YMM being saved;
[https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85100](https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85100)
says that all versions of gcc until 2018 handled this incorrectly in
`target`). Similar comments apply to other `compilers/*` files.
If some compilers fail (for example, clang is not installed, or the
compiler version is too old to support the compiler options used in
lib25519), the lib25519 compilation process will try its best to produce
a working library using the remaining compilers, even if this means
lower performance.
## Trimming
By default, to reduce size of the compiled library, the lib25519
compilation process trims the library down to the implementations that
are selected by lib25519's selection mechanism (across all CPUs; the
library remains portable, not tied to the compilation CPU).
This trimming is handled at link time rather than compile time to
increase the chance that, even if some implementations are broken by
compiler "upgrades", the library will continue to build successfully.
To avoid this trimming, pass the `--notrim` option to `./configure`.
All implementations that compile are then included in the library,
tested by `lib2519-test`, and measured by `lib25519-speed`. You'll want
to avoid trimming if you're adding new instruction sets or new
implementations (see below), so that you can run tests and benchmarks of
code that isn't selected yet.
## How to recompile after changes
If you make changes in the lib25519 source directory, you have to run
`./configure` again to repopulate the build directory. Simply running
`make` again doesn't suffice.
By default, `./configure` cleans the build directory first, so `make`
will recompile everything. This can be on the scale of seconds if you
have enough cores, but maybe you're developing on a slower machine. Two
options are currently available to accelerate the edit-compile cycle:
* There is an experimental `--noclean` option to `./configure` that,
for some simple types of changes, can produce a successful build
without cleaning.
* You can disable the implementations you're not using by setting
sticky bits on the source directories for those implementations:
e.g., `chmod +t crypto_nG/*/*avx2*`.
Make sure to reenable all implementations and do a full clean build if
you're collecting data to add to the source `benchmarks` directory.
## How to add new instruction sets
Adding another file `compilers/amd64+foo`, along with a `supports()`
implementation in `compilers/amd64+foo.c`, will support a new
instruction set. Do not assume that the new `foo` instruction set
implies support for older instruction sets (the idea of "levels" of
instruction sets); instead make sure to include the older instruction
sets in `+` tags, as illustrated by
`compilers/amd64+avx+bmi2+avx2+adx+avx512f+avx512vl+avx512ifma`.
In the compiler options, always make sure to include `-fPIC` to support
shared libraries, and `-fwrapv` to switch to a slightly less dangerous
version of C.
The `foo` tags don't have to be instruction sets. For example, if a CPU
has the same instruction set but wants different optimizations because
of differences in instruction timings, you can make a tag for those
optimizations, using, e.g., CPU IDs or benchmarks in the corresponding
`supports()` function to decide whether to enable those optimizations.
Benchmarks tend to be more future-proof than a list of CPU IDs, but the
time taken for benchmarks at program startup has to be weighed against
the subsequent speedup from the resulting optimizations.
To see how well lib25519 performs with the new compilers, run
`lib25519-speed` on the target machine and look for the `foo` lines in
the output. If the new performance is better than the performance shown
on the `selected` lines:
* Copy the `lib25519-speed` output into a file on the `benchmarks`
directory, typically named after the hostname of the target
machine.
* Run `./prioritize` in the top-level directory to create `priority`
files. These files tell lib25519 which implementations to select
for any given architecture.
* Reconfigure (again with `--notrim`), recompile, rerun
`lib25519-test`, and rerun `lib25519-speed` to check that the
`default` lines now use the `foo` compiler.
If the `foo` implementation is outperformed by other implementations,
then these steps don't help except for documenting this fact. The same
implementation might turn out to be useful for subsequent `foo` CPUs.
## How to add new implementations
Taking full advantage of the `foo` instruction set usually requires
writing new implementations. Sometimes there are also ideas for taking
better advantage of existing instruction sets.
Structurally, adding a new implementation of a primitive is a simple
matter of adding a new subdirectory with the code for that
implementation. Most of the work is optimizing the use of `foo`
intrinsics in `.c` files or `foo` instructions in `.S` files. Make sure
to include an `architectures` file saying, e.g., `amd64 avx2 foo`.
Names of implementation directories can use letters, digits, dashes, and
underscores. Do not use two implementation names that are the same when
dashes and underscores are removed.
All `.c` and `.S` files in the implementation directory are compiled and
linked. There is no need to edit a separate list of these files. You can
also use `.h` files via the C preprocessor.
If an implementation is actually more restrictive than indicated in
`architectures` then the resulting compiled library will fail on some
machines (although perhaps that implementation will not be used by
default). Putting unnecessary restrictions into `architectures` will not
create such failures, but can unnecessarily limit performance.
Some, but not all, mistakes in `architectures` will produce warnings
from the `checkinsns` script that runs automatically when lib25519 is
compiled. Running the `lib25519-test` program tries all implementations,
but only on the CPU where `lib25519-test` is being run, and `lib25519-test`
does not guarantee code coverage: for example, other message lengths
being signed could involve other code paths.
`amd64` implies little-endian, and implies architectural support for
unaligned loads and stores. Beware, however, that the Intel/AMD
vectorized `load`/`store` intrinsics (and the underlying `movdqa`
instruction) require alignment; if in doubt, use `loadu`/`storeu` (and
`movdqu`). The `lib25519-test` program checks unaligned inputs and
outputs, but can miss issues with unaligned stack variables.
To test your implementation, compile everything, check for compiler
warnings and errors, run `lib25519-test` (or just `lib25519-test nG` to
test a `crypto_nG` implementation), and check for a line saying `all
tests succeeded`. To use AddressSanitizer (for catching, at run time,
buffer overflows in C code), add `-fsanitize=address` to the `gcc` and
`clang` lines in `compilers/*`.
To see the performance of your implementation, run `lib25519-speed`.
If the new performance is better than the performance shown on the
`default` lines, follow the same steps as for a new instruction set:
copy the `lib25519-speed` output into a file on the `benchmarks`
directory; run `./prioritize` in the top-level directory to create
`priority` files; reconfigure (again with `--notrim`); recompile; rerun
`lib25519-test`; rerun `lib25519-speed`; check that the `default` lines
now use the new implementation.
## How to handle namespacing
As in SUPERCOP and NaCl, to call `crypto_hash_sha512()`, you have to
include `crypto_hash_sha512.h`; but to write an implementation of
`crypto_hash_sha512()`, you have to instead include `crypto_hash.h` and
define `crypto_hash`. Similar comments apply to other primitives.
The function name that's actually linked might end up as, e.g.,
`lib25519_hash_sha512_blocksplusavx_C2_hash` where `blocksplusavx`
indicates the implementation and `C2` indicates the compiler. Don't try
to build this name into your implementation.
If you have another global symbol `x` (for example, a non-`static`
function in a `.c` file, or a non-`static` variable outside functions in
a `.c` file), you have to replace it with `CRYPTO_NAMESPACE(x)`, for
example with `#define x CRYPTO_NAMESPACE(x)`.
For global symbols in `.S` files and `shared-*.c` files, use
`CRYPTO_SHARED_NAMESPACE` instead of `CRYPTO_NAMESPACE`. For `.S` files
that define both `x` and `_x` to handle platforms where `x` in C is `_x`
in assembly, use `CRYPTO_SHARED_NAMESPACE(x)` and
`_CRYPTO_SHARED_NAMESPACE(x)`; `CRYPTO_SHARED_NAMESPACE(_x)` is not
sufficient.
lib25519 includes a mechanism to recognize files that are copied across
implementations (possibly of different primitives) and to unify those
into a file compiled only once, reducing the overall size of the
compiled library and possibly improving cache utilization. To request
this mechanism, include a line `// linker define x` for any global
symbol `x` defined in the file, and a line `// linker use x` for any
global symbol `x` used in the file from the same implementation (not
`crypto_*` subroutines that you're calling, `randombytes`, etc.). This
mechanism tries very hard, perhaps too hard, to avoid improperly
unifying files: for example, even a slight difference in a `.h` file
included by a file defining a used symbol will disable the mechanism.
Typical namespacing mistakes will produce either linker failures or
warnings from the `checknamespace` script that runs automatically when
lib25519 is compiled.
+121
View File
@@ -0,0 +1,121 @@
Creative Commons Legal Code
CC0 1.0 Universal
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
HEREUNDER.
Statement of Purpose
The laws of most jurisdictions throughout the world automatically confer
exclusive Copyright and Related Rights (defined below) upon the creator
and subsequent owner(s) (each and all, an "owner") of an original work of
authorship and/or a database (each, a "Work").
Certain owners wish to permanently relinquish those rights to a Work for
the purpose of contributing to a commons of creative, cultural and
scientific works ("Commons") that the public can reliably and without fear
of later claims of infringement build upon, modify, incorporate in other
works, reuse and redistribute as freely as possible in any form whatsoever
and for any purposes, including without limitation commercial purposes.
These owners may contribute to the Commons to promote the ideal of a free
culture and the further production of creative, cultural and scientific
works, or to gain reputation or greater distribution for their Work in
part through the use and efforts of others.
For these and/or other purposes and motivations, and without any
expectation of additional consideration or compensation, the person
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
is an owner of Copyright and Related Rights in the Work, voluntarily
elects to apply CC0 to the Work and publicly distribute the Work under its
terms, with knowledge of his or her Copyright and Related Rights in the
Work and the meaning and intended legal effect of CC0 on those rights.
1. Copyright and Related Rights. A Work made available under CC0 may be
protected by copyright and related or neighboring rights ("Copyright and
Related Rights"). Copyright and Related Rights include, but are not
limited to, the following:
i. the right to reproduce, adapt, distribute, perform, display,
communicate, and translate a Work;
ii. moral rights retained by the original author(s) and/or performer(s);
iii. publicity and privacy rights pertaining to a person's image or
likeness depicted in a Work;
iv. rights protecting against unfair competition in regards to a Work,
subject to the limitations in paragraph 4(a), below;
v. rights protecting the extraction, dissemination, use and reuse of data
in a Work;
vi. database rights (such as those arising under Directive 96/9/EC of the
European Parliament and of the Council of 11 March 1996 on the legal
protection of databases, and under any national implementation
thereof, including any amended or successor version of such
directive); and
vii. other similar, equivalent or corresponding rights throughout the
world based on applicable law or treaty, and any national
implementations thereof.
2. Waiver. To the greatest extent permitted by, but not in contravention
of, applicable law, Affirmer hereby overtly, fully, permanently,
irrevocably and unconditionally waives, abandons, and surrenders all of
Affirmer's Copyright and Related Rights and associated claims and causes
of action, whether now known or unknown (including existing as well as
future claims and causes of action), in the Work (i) in all territories
worldwide, (ii) for the maximum duration provided by applicable law or
treaty (including future time extensions), (iii) in any current or future
medium and for any number of copies, and (iv) for any purpose whatsoever,
including without limitation commercial, advertising or promotional
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
member of the public at large and to the detriment of Affirmer's heirs and
successors, fully intending that such Waiver shall not be subject to
revocation, rescission, cancellation, termination, or any other legal or
equitable action to disrupt the quiet enjoyment of the Work by the public
as contemplated by Affirmer's express Statement of Purpose.
3. Public License Fallback. Should any part of the Waiver for any reason
be judged legally invalid or ineffective under applicable law, then the
Waiver shall be preserved to the maximum extent permitted taking into
account Affirmer's express Statement of Purpose. In addition, to the
extent the Waiver is so judged Affirmer hereby grants to each affected
person a royalty-free, non transferable, non sublicensable, non exclusive,
irrevocable and unconditional license to exercise Affirmer's Copyright and
Related Rights in the Work (i) in all territories worldwide, (ii) for the
maximum duration provided by applicable law or treaty (including future
time extensions), (iii) in any current or future medium and for any number
of copies, and (iv) for any purpose whatsoever, including without
limitation commercial, advertising or promotional purposes (the
"License"). The License shall be deemed effective as of the date CC0 was
applied by Affirmer to the Work. Should any part of the License for any
reason be judged legally invalid or ineffective under applicable law, such
partial invalidity or ineffectiveness shall not invalidate the remainder
of the License, and in such case Affirmer hereby affirms that he or she
will not (i) exercise any of his or her remaining Copyright and Related
Rights in the Work or (ii) assert any associated claims and causes of
action with respect to the Work, in either case contrary to Affirmer's
express Statement of Purpose.
4. Limitations and Disclaimers.
a. No trademark or patent rights held by Affirmer are waived, abandoned,
surrendered, licensed or otherwise affected by this document.
b. Affirmer offers the Work as-is and makes no representations or
warranties of any kind concerning the Work, express, implied,
statutory or otherwise, including without limitation warranties of
title, merchantability, fitness for a particular purpose, non
infringement, or the absence of latent or other defects, accuracy, or
the present or absence of errors, whether or not discoverable, all to
the greatest extent permissible under applicable law.
c. Affirmer disclaims responsibility for clearing rights of other persons
that may apply to the Work or any use thereof, including without
limitation any person's Copyright and Related Rights in the Work.
Further, Affirmer disclaims responsibility for obtaining any necessary
consents, permissions or other rights required for any use of the
Work.
d. Affirmer understands and acknowledges that Creative Commons is not a
party to this document and has no duty or obligation with respect to
this CC0 or use of the Work.
+8
View File
@@ -0,0 +1,8 @@
default:
cd build && $(MAKE)
install:
cd build && $(MAKE) install
clean:
cd build && $(MAKE) clean
+28
View File
@@ -0,0 +1,28 @@
lib25519 is a microlibrary for the X25519 encryption system and the
Ed25519 signature system, both of which use the Curve25519 elliptic
curve. Curve25519 is the fastest curve in TLS 1.3, and the only curve in
Wireguard, Signal, and many other applications (see Nicolai Brown's page
https://ianix.com/pub/curve25519-deployment.html).
lib25519 has a very simple stateless API based on the SUPERCOP API, with
wire-format inputs and outputs, providing functions that directly match
the central cryptographic operations in X25519 and Ed25519:
lib25519_dh_keypair(pk,sk): X25519 key generation
lib25519_dh(k,pk,sk): shared-secret generation
lib25519_sign_keypair(pk,sk): Ed25519 key generation
lib25519_sign(sm,&smlen,m,mlen,sk): signing
lib25519_sign_open(m,&mlen,sm,smlen,pk): verification + message recovery
Internally, lib25519 includes implementations designed for performance
on various CPUs, implementations designed to work portably across CPUs,
and automatic run-time selection of implementations.
lib25519 is intended to be called by larger multi-function libraries,
including libraries in other languages via FFI. The idea is that
lib25519 will take responsibility for the details of X25519/Ed25519
computation, including optimization, timing-attack protection, and
eventually verification, freeing up the calling libraries to concentrate
on application-specific needs such as protocol integration. Applications
can also call lib25519 directly.
+87
View File
@@ -0,0 +1,87 @@
Security model: X25519 is designed to be strong as a component of
various well-known "hashed DH" applications, and in particular is
designed to make the CDH problem difficult with respect to the standard
base. Ed25519 is designed to provide EUF-CMA, the standard notion of
unforgeability of a signature system under chosen-message attacks.
However, some applications need other security notions that are not
provided by X25519 and Ed25519.
Security level: X25519 and Ed25519 are more difficult to break by any
known attacks than a typical 128-bit cipher. They have an extremely
stable security track record, with two decades of research changing
security levels by only a fraction of a bit. They also proactively avoid
various potential risks. However, large quantum computers will break
both X25519 and Ed25519.
Software verification: lib25519 is intended to become a central target
for verification of full functional correctness of implementations of
X25519 and Ed25519. However, only certain portions have been verified so
far, and at this point the code should be presumed to have critical
bugs.
Timing attacks: lib25519 is designed to avoid all data flow from secret
data to memory addresses and branch conditions. Fully protecting the
user against timing attacks requires addressing more issues, such as the
following:
* Other common instructions used by lib25519 take variable time on
some CPUs. In particular, there are some embedded CPUs with
variable-time multipliers.
* Many CPUs include dynamic frequency-selection mechanisms such as
Turbo Boost, exposing power information via timing information.
Fortunately, these CPUs are normally shipped with simple options to
disable Turbo Boost etc., closing this leak; unfortunately, Turbo
Boost is enabled by default on CPUs that support it.
* Cryptographic keys are normally handled by cryptographic software,
but other user secrets are handled by many different pieces of
software.
See https://timing.attacks.cr.yp.to for a timing-attack survey and many
references.
Speculative-execution attacks: Some countermeasures against
speculative-execution attacks are planned but are not included in the
current version of lib25519. Full protection again requires addressing
issues at other system layers.
Further side-channel attacks: Even if all legitimate user sensors are
successfully kept isolated from attackers, attackers can set up their
own power sensors, electromagnetic sensors, acoustic sensors, etc.
Keeping cryptographic operations physically separated from sensors tends
to make such attacks much more expensive but is often infeasible.
"Masking" cryptographic computations seems to help and can be
affordable, although the security of masking is difficult to evaluate
and there are many broken masked implementations. Currently lib25519
does not include any masked implementations, so presumably it is easily
breakable by power attacks in environments where attackers can see power
consumption.
Further attacks: lib25519 creates an Ed25519 signing nonce as a hash of
the message, a long-term secret, and new randomness (specifically, the
nonce is a keyed hash of the message, where the key is the hash of the
long-term secret and new randomness). The literature identifies various
advantages and disadvantages of including these hash inputs:
* Including the message and a long-term secret protects against
signing-time RNG failures. This is a standard feature of Ed25519
signers.
* To the extent that the RNG works, including new randomness has the
advantage of stopping (e.g.) fault attacks that rely on a nonce
being reused for multiple signatures of the same message.
* Including new randomness has the disadvantage of requiring state
for the RNG. However, lib25519 runs within an OS that in any case
maintains state and provides an RNG.
* Including new randomness also has the disadvantage of interfering
with the use of test vectors. This disadvantage does not apply to
lib25519: lib25519's test vectors already handle randomness.
lib25519 includes a few further steps that could be useful in stopping
fault attacks (for example, signature verification internally converts
invalid public keys to the key (...,26), which does not have a known
discrete logarithm), but in general lib25519 should be presumed
breakable by fault attacks.
+17
View File
@@ -0,0 +1,17 @@
track history more precisely in documentation for individual source files
consider symlinks from files in build tree to source tree (via a build/source symlink so build can link elsewhere)
allow shared-* for API functions (requires tweaking dispatch)
speedups for more architectures
speedups for more microarchitectures
consider faster PRF for the keyed hash giving the nonce
merge subroutines in source to the extent possible
scan for and remove any unused functions and files
restructure for more merging at object-code level
sort object files (for, e.g., improved cache utilization)
optionally allow post-installation patching of current cpu as an exceptional cpuid
(based on benchmarks and, with more CPU time, full functionality tests)
dispatch: eliminate, e.g., avx2 if avx is higher priority
speed up dispatch cpuid tests (lazy evaluation, merging cpuid calls)
randombytes: support getrandom, getentropy
verify constbranch, constindex
full functional verification
+51
View File
@@ -0,0 +1,51 @@
crypto_verify/32
#define crypto_verify_32_BYTES 32
int crypto_verify(const unsigned char *,const unsigned char *);
crypto_hashblocks/sha512 f0bc623a9033f9f648336540e11e85be21aeb60905c7d8808d10ea20b39d58d1 f1a2c46c9ce7fa4cd22f180907d77b6f7189badef4b9a1b5284d6fb9db859b76
#define crypto_hashblocks_sha512_STATEBYTES 64
#define crypto_hashblocks_sha512_BLOCKBYTES 128
int crypto_hashblocks(unsigned char *,const unsigned char *,long long);
crypto_hash/sha512 8220572f58bd4730be165c9739d8d4b0fd2e0229dbe01e25b4aed23f00f23b70 c1e322b7cbfc941260c5508967ba05bce22eeee94d425e708b7c3301ea1d5e2e
#define crypto_hash_sha512_BYTES 64
void crypto_hash(unsigned char *,const unsigned char *,long long);
crypto_pow/inv25519 ad2062946e82718da820226504991a85c5fe56bdbff959c1313f837ee13b37be 59b3045a01e1fca2a86a0280aee8b985c5e040afdc0d3e85ed87eb97a46a4dd6
#define crypto_pow_inv25519_BYTES 32
void crypto_pow(unsigned char *,const unsigned char *);
crypto_nP/montgomery25519 b861d66109b42359e5994ed57ae566827c345b65a9d0671700320b82888397ec 740924011f3448f65299f61b087f74a6eb9651a4203dfbf621d2bec54e149405
#define crypto_nP_montgomery25519_SCALARBYTES 32
#define crypto_nP_montgomery25519_POINTBYTES 32
void crypto_nP(unsigned char *,const unsigned char *,const unsigned char *);
crypto_nG/merged25519 a4e761839798a07817484e97605bd63215b4938934ed9ce01935bbced48155bc 0a01c09fc8a8c7e8c18f841b2e1b2da9c156868737d194d223b03531cf2db731
#define crypto_nG_merged25519_SCALARBYTES 32
#define crypto_nG_merged25519_POINTBYTES 32
crypto_nG/montgomery25519 5c8a5d8b32e3d26b33071779ce9191095d7bd4ab3bb6a40b68976e41a98cfc3b 2becc8cd065820fcf82e53a03c5b5235582480fc11d072f2bd15153aebd4e057
#define crypto_nG_montgomery25519_SCALARBYTES 32
#define crypto_nG_montgomery25519_POINTBYTES 32
void crypto_nG(unsigned char *,const unsigned char *);
crypto_mGnP/ed25519 dc80be44fb0d482c5ae430779e76fe612c53fcd9e5847254bf27ab34e90745f4 9e1a3b7015c8fdb12763fd88494f5bfe9e2565ead4d3407d5ecf7ff6ca24c1d0
#define crypto_mGnP_ed25519_MBYTES 32
#define crypto_mGnP_ed25519_NBYTES 64
#define crypto_mGnP_ed25519_PBYTES 32
#define crypto_mGnP_ed25519_OUTPUTBYTES 33
void crypto_mGnP(unsigned char *,const unsigned char *,const unsigned char *,const unsigned char *);
crypto_dh/x25519 2c8a73ec86d5d4c4bc838f49cfd78c87b60b534ae6fff59ce3bea0c32cdc1450 b09016b3a1371786b46a183085133338159e623c5eb9cbc5eaa4f8b62d6c5aea
#define crypto_dh_x25519_SECRETKEYBYTES 32
#define crypto_dh_x25519_PUBLICKEYBYTES 32
#define crypto_dh_x25519_BYTES 32
void crypto_dh_keypair(unsigned char *,unsigned char *);
void crypto_dh(unsigned char *,const unsigned char *,const unsigned char *);
crypto_sign/ed25519 ce11fd7c1eac4dd0bc5eec49b26ad1e91aef696fae50ce377dbd806dc394da01 2ed857f17c917a8185e6c296303a11772ae45683a5e7cb5b095489bad65fffde
#define crypto_sign_ed25519_SECRETKEYBYTES 64
#define crypto_sign_ed25519_PUBLICKEYBYTES 32
#define crypto_sign_ed25519_BYTES 64
void crypto_sign_keypair(unsigned char *,unsigned char *);
void crypto_sign(unsigned char *,long long *,const unsigned char *,long long,const unsigned char *);
int crypto_sign_open(unsigned char *,long long *,const unsigned char *,long long,const unsigned char *);
+338
View File
@@ -0,0 +1,338 @@
#!/usr/bin/env python3
output = r'''/* WARNING: auto-generated (by autogen-speed); do not edit */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/resource.h>
#include "cpucycles.h" /* -lcpucycles */
#include "lib25519.h" /* -l25519 */
#include "randombytes.h" /* -lrandombytes_kernel */
static const char *targeto = 0;
static const char *targetp = 0;
static const char *targeti = 0;
#include "limits.inc"
static unsigned char *alignedcalloc(unsigned long long len)
{
unsigned char *x = (unsigned char *) calloc(1,len + 128);
if (!x) abort();
/* will never deallocate so shifting is ok */
x += 63 & (-(unsigned long) x);
return x;
}
#define TIMINGS 15
static long long t[TIMINGS+1];
static void t_print(const char *op,long long impl,long long len)
{
long long median = 0;
printf("%s",op);
if (impl >= 0)
printf(" %lld",impl);
else
printf(" selected");
printf(" %lld",len);
for (long long i = 0;i < TIMINGS;++i)
t[i] = t[i+1]-t[i];
for (long long j = 0;j < TIMINGS;++j) {
long long belowj = 0;
long long abovej = 0;
for (long long i = 0;i < TIMINGS;++i) if (t[i] < t[j]) ++belowj;
for (long long i = 0;i < TIMINGS;++i) if (t[i] > t[j]) ++abovej;
if (belowj*2 < TIMINGS && abovej*2 < TIMINGS) {
median = t[j];
break;
}
}
printf(" %lld ",median);
for (long long i = 0;i < TIMINGS;++i)
printf("%+lld",t[i]-median);
printf("\n");
fflush(stdout);
}
#define MAXTEST_BYTES 65536
static void measure_cpucycles(void)
{
printf("cpucycles selected persecond %lld\n",cpucycles_persecond());
printf("cpucycles selected implementation %s\n",cpucycles_implementation());
for (long long i = 0;i <= TIMINGS;++i)
t[i] = cpucycles();
t_print("cpucycles",-1,0);
}
static void measure_randombytes(void)
{
unsigned char *m = alignedcalloc(MAXTEST_BYTES);
long long mlen = 0;
while (mlen < MAXTEST_BYTES) {
for (long long i = 0;i <= TIMINGS;++i) {
t[i] = cpucycles();
randombytes(m,mlen);
}
t_print("randombytes",-1,mlen);
mlen += 1+mlen/2;
}
}
'''
# XXX: integrate todo into api
todo = (
('verify',(
('x','lib25519_verify_BYTES'),
('y','lib25519_verify_BYTES'),
),(
('crypto_verify','x','y'),
)),
('hashblocks',(
('h','lib25519_hashblocks_STATEBYTES'),
('m','MAXTEST_BYTES'),
('mlen',None),
),(
('crypto_hashblocks','h','m','mlen'),
)),
('hash',(
('h','lib25519_hash_BYTES'),
('m','MAXTEST_BYTES'),
('mlen',None),
),(
('crypto_hash','h','m','mlen'),
)),
('pow',(
('n','lib25519_pow_BYTES'),
('ne','lib25519_pow_BYTES'),
),(
('crypto_pow','ne','n'),
)),
('nP',(
('n','lib25519_nP_SCALARBYTES'),
('P','lib25519_nP_POINTBYTES'),
('nP','lib25519_nP_POINTBYTES'),
),(
('crypto_nP','nP','n','P'),
)),
('nG',(
('n','lib25519_nP_SCALARBYTES'),
('nG','lib25519_nP_POINTBYTES'),
),(
('crypto_nG','nG','n'),
)),
('mGnP',(
('mGnP','lib25519_mGnP_OUTPUTBYTES'),
('m','lib25519_mGnP_MBYTES'),
('n','lib25519_mGnP_NBYTES'),
('P','lib25519_mGnP_PBYTES'),
),(
('crypto_mGnP','mGnP','m','n','P'),
)),
('dh',(
('pka','lib25519_dh_PUBLICKEYBYTES'),
('ska','lib25519_dh_SECRETKEYBYTES'),
('pkb','lib25519_dh_PUBLICKEYBYTES'),
('skb','lib25519_dh_SECRETKEYBYTES'),
('ka','lib25519_dh_BYTES'),
),(
('crypto_dh_keypair','pka','ska'),
('crypto_dh_keypair','pkb','skb'),
('crypto_dh','ka','pkb','ska'),
)),
('sign',(
('pk','lib25519_sign_PUBLICKEYBYTES'),
('sk','lib25519_sign_SECRETKEYBYTES'),
('m','MAXTEST_BYTES+lib25519_sign_BYTES'),
('sm','MAXTEST_BYTES+lib25519_sign_BYTES'),
('m2','MAXTEST_BYTES+lib25519_sign_BYTES'),
('mlen',None),
('smlen',None),
('m2len',None),
),(
('crypto_sign_keypair','pk','sk'),
('crypto_sign','sm','&smlen','m','mlen','sk'),
('crypto_sign_open','m2','&m2len','sm','smlen','pk'),
)),
)
operations = []
primitives = {}
sizes = {}
exports = {}
prototypes = {}
with open('api') as f:
for line in f:
line = line.strip()
if line.startswith('crypto_'):
x = line.split()
x = x[0].split('/')
assert len(x) == 2
o = x[0].split('_')[1]
if o not in operations: operations += [o]
p = x[1]
if o not in primitives: primitives[o] = []
primitives[o] += [p]
continue
if line.startswith('#define '):
x = line.split(' ')
x = x[1].split('_')
assert len(x) == 4
assert x[0] == 'crypto'
o = x[1]
p = x[2]
if (o,p) not in sizes: sizes[o,p] = ''
sizes[o,p] += line+'\n'
continue
if line.endswith(');'):
fun,args = line[:-2].split('(')
rettype,fun = fun.split()
fun = fun.split('_')
o = fun[1]
assert fun[0] == 'crypto'
if o not in exports: exports[o] = []
exports[o] += ['_'.join(fun[1:])]
if o not in prototypes: prototypes[o] = []
prototypes[o] += [(rettype,fun,args)]
for t in todo:
o,vars,benches = t
for p in primitives[o]:
output += '\n'
output += 'static void measure_%s_%s(void)\n' % (o,p)
output += '{\n'
output += ' if (targeto && strcmp(targeto,"%s")) return;\n' % o
output += ' if (targetp && strcmp(targetp,"%s")) return;\n' % p
varsize = {}
for v,size in vars:
if size is None:
output += ' long long %s;\n' % v
else:
size = size.replace('lib25519_'+o,'lib25519_'+o+'_'+p)
output += ' unsigned char *%s = alignedcalloc(%s);\n' % (v,size)
varsize[v] = size
output += '\n'
output += ' for (long long impl = -1;impl < lib25519_numimpl_%s_%s();++impl) {\n' % (o,p)
for rettype,fun,args in prototypes[o]:
output += ' %s (*%s)(%s);\n' % (rettype,'_'.join(fun),args)
output += ' if (targeti && strcmp(targeti,lib25519_dispatch_%s_%s_implementation(impl))) continue;\n' % (o,p)
output += ' if (impl >= 0) {\n'
for rettype,fun,args in prototypes[o]:
f2 = ['lib25519','dispatch',o,p]+fun[2:]
output += ' %s = %s(impl);\n' % ('_'.join(fun),'_'.join(f2))
output += r' printf("%s_%s %%lld implementation %%s compiler %%s\n",impl,lib25519_dispatch_%s_%s_implementation(impl),lib25519_dispatch_%s_%s_compiler(impl));' % (o,p,o,p,o,p)
output += '\n'
output += ' } else {\n'
for rettype,fun,args in prototypes[o]:
f2 = ['lib25519',o,p]+fun[2:]
output += ' %s = %s;\n' % ('_'.join(fun),'_'.join(f2))
output += r' printf("%s_%s selected implementation %%s compiler %%s\n",lib25519_%s_%s_implementation(),lib25519_%s_%s_compiler());' % (o,p,o,p,o,p)
output += '\n'
output += ' }\n'
for v,size in vars:
if size is not None:
size = size.replace('lib25519_'+o,'lib25519_'+o+'_'+p)
output += ' randombytes(%s,%s);\n' % (v,size)
alreadybenched = set()
alreadybenched.add('assert')
for b in benches:
if b[0] in alreadybenched:
output += ' %s(%s);\n' % (b[0],','.join(b[1:]))
continue
fun = b[0].split('_')
shortfun = '_'.join([o,p]+fun[2:])
alreadybenched.add(b[0])
if 'mlen' in b[1:] or 'smlen' in b[1:]:
output += ' mlen = 0;\n'
output += ' while (mlen <= MAXTEST_BYTES) {\n'
output += ' randombytes(m,mlen);\n'
if shortfun == 'sign_ed25519_open': # XXX: put this into todo
output += ' lib25519_sign(sm,&smlen,m,mlen,sk);\n'
output += ' for (long long i = 0;i <= TIMINGS;++i) {\n'
output += ' t[i] = cpucycles();\n'
output += ' %s(%s);\n' % (b[0],','.join(b[1:]))
output += ' }\n'
output += ' t_print("%s",impl,mlen);\n' % (shortfun)
if shortfun == 'sign_ed25519_open': # XXX: put this into todo
output += ' /* this is, in principle, not a test program */\n'
output += ' /* but some checks here help validate the data flow above */\n'
output += ' assert(m2len == mlen);\n'
output += ' assert(!memcmp(m,m2,mlen));\n'
if o == 'sign': # XXX: put this into todo
output += ' mlen += 1+mlen/4;\n'
else:
output += ' mlen += 1+mlen/2;\n'
output += ' }\n'
else:
output += ' for (long long i = 0;i <= TIMINGS;++i) {\n'
output += ' t[i] = cpucycles();\n'
output += ' %s(%s);\n' % (b[0],','.join(b[1:]))
output += ' }\n'
output += ' t_print("%s",impl,%s);\n' % (shortfun,varsize[b[1]])
output += ' }\n'
output += '}\n'
output += r'''
#include "print_cpuid.inc"
int main(int argc,char **argv)
{
printf("lib25519 version %s\n",lib25519_version);
printf("lib25519 arch %s\n",lib25519_arch);
print_cpuid();
if (*argv) ++argv;
if (*argv) {
targeto = *argv++;
if (*argv) {
targetp = *argv++;
if (*argv) {
targeti = *argv++;
}
}
}
measure_cpucycles();
measure_randombytes();
limits();
'''
for t in todo:
o,vars,benches = t
for p in primitives[o]:
output += ' measure_%s_%s();\n' % (o,p)
output += r'''
return 0;
}
'''
with open('command/lib25519-speed.c','w') as f:
f.write(output)
+1217
View File
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+466
View File
@@ -0,0 +1,466 @@
/* WARNING: auto-generated (by autogen-speed); do not edit */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/resource.h>
#include "cpucycles.h" /* -lcpucycles */
#include "lib25519.h" /* -l25519 */
#include "randombytes.h" /* -lrandombytes_kernel */
static const char *targeto = 0;
static const char *targetp = 0;
static const char *targeti = 0;
#include "limits.inc"
static unsigned char *alignedcalloc(unsigned long long len)
{
unsigned char *x = (unsigned char *) calloc(1,len + 128);
if (!x) abort();
/* will never deallocate so shifting is ok */
x += 63 & (-(unsigned long) x);
return x;
}
#define TIMINGS 15
static long long t[TIMINGS+1];
static void t_print(const char *op,long long impl,long long len)
{
long long median = 0;
printf("%s",op);
if (impl >= 0)
printf(" %lld",impl);
else
printf(" selected");
printf(" %lld",len);
for (long long i = 0;i < TIMINGS;++i)
t[i] = t[i+1]-t[i];
for (long long j = 0;j < TIMINGS;++j) {
long long belowj = 0;
long long abovej = 0;
for (long long i = 0;i < TIMINGS;++i) if (t[i] < t[j]) ++belowj;
for (long long i = 0;i < TIMINGS;++i) if (t[i] > t[j]) ++abovej;
if (belowj*2 < TIMINGS && abovej*2 < TIMINGS) {
median = t[j];
break;
}
}
printf(" %lld ",median);
for (long long i = 0;i < TIMINGS;++i)
printf("%+lld",t[i]-median);
printf("\n");
fflush(stdout);
}
#define MAXTEST_BYTES 65536
static void measure_cpucycles(void)
{
printf("cpucycles selected persecond %lld\n",cpucycles_persecond());
printf("cpucycles selected implementation %s\n",cpucycles_implementation());
for (long long i = 0;i <= TIMINGS;++i)
t[i] = cpucycles();
t_print("cpucycles",-1,0);
}
static void measure_randombytes(void)
{
unsigned char *m = alignedcalloc(MAXTEST_BYTES);
long long mlen = 0;
while (mlen < MAXTEST_BYTES) {
for (long long i = 0;i <= TIMINGS;++i) {
t[i] = cpucycles();
randombytes(m,mlen);
}
t_print("randombytes",-1,mlen);
mlen += 1+mlen/2;
}
}
static void measure_verify_32(void)
{
if (targeto && strcmp(targeto,"verify")) return;
if (targetp && strcmp(targetp,"32")) return;
unsigned char *x = alignedcalloc(lib25519_verify_32_BYTES);
unsigned char *y = alignedcalloc(lib25519_verify_32_BYTES);
for (long long impl = -1;impl < lib25519_numimpl_verify_32();++impl) {
int (*crypto_verify)(const unsigned char *,const unsigned char *);
if (targeti && strcmp(targeti,lib25519_dispatch_verify_32_implementation(impl))) continue;
if (impl >= 0) {
crypto_verify = lib25519_dispatch_verify_32(impl);
printf("verify_32 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_verify_32_implementation(impl),lib25519_dispatch_verify_32_compiler(impl));
} else {
crypto_verify = lib25519_verify_32;
printf("verify_32 selected implementation %s compiler %s\n",lib25519_verify_32_implementation(),lib25519_verify_32_compiler());
}
randombytes(x,lib25519_verify_32_BYTES);
randombytes(y,lib25519_verify_32_BYTES);
for (long long i = 0;i <= TIMINGS;++i) {
t[i] = cpucycles();
crypto_verify(x,y);
}
t_print("verify_32",impl,lib25519_verify_32_BYTES);
}
}
static void measure_hashblocks_sha512(void)
{
if (targeto && strcmp(targeto,"hashblocks")) return;
if (targetp && strcmp(targetp,"sha512")) return;
unsigned char *h = alignedcalloc(lib25519_hashblocks_sha512_STATEBYTES);
unsigned char *m = alignedcalloc(MAXTEST_BYTES);
long long mlen;
for (long long impl = -1;impl < lib25519_numimpl_hashblocks_sha512();++impl) {
int (*crypto_hashblocks)(unsigned char *,const unsigned char *,long long);
if (targeti && strcmp(targeti,lib25519_dispatch_hashblocks_sha512_implementation(impl))) continue;
if (impl >= 0) {
crypto_hashblocks = lib25519_dispatch_hashblocks_sha512(impl);
printf("hashblocks_sha512 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_hashblocks_sha512_implementation(impl),lib25519_dispatch_hashblocks_sha512_compiler(impl));
} else {
crypto_hashblocks = lib25519_hashblocks_sha512;
printf("hashblocks_sha512 selected implementation %s compiler %s\n",lib25519_hashblocks_sha512_implementation(),lib25519_hashblocks_sha512_compiler());
}
randombytes(h,lib25519_hashblocks_sha512_STATEBYTES);
randombytes(m,MAXTEST_BYTES);
mlen = 0;
while (mlen <= MAXTEST_BYTES) {
randombytes(m,mlen);
for (long long i = 0;i <= TIMINGS;++i) {
t[i] = cpucycles();
crypto_hashblocks(h,m,mlen);
}
t_print("hashblocks_sha512",impl,mlen);
mlen += 1+mlen/2;
}
}
}
static void measure_hash_sha512(void)
{
if (targeto && strcmp(targeto,"hash")) return;
if (targetp && strcmp(targetp,"sha512")) return;
unsigned char *h = alignedcalloc(lib25519_hash_sha512_BYTES);
unsigned char *m = alignedcalloc(MAXTEST_BYTES);
long long mlen;
for (long long impl = -1;impl < lib25519_numimpl_hash_sha512();++impl) {
void (*crypto_hash)(unsigned char *,const unsigned char *,long long);
if (targeti && strcmp(targeti,lib25519_dispatch_hash_sha512_implementation(impl))) continue;
if (impl >= 0) {
crypto_hash = lib25519_dispatch_hash_sha512(impl);
printf("hash_sha512 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_hash_sha512_implementation(impl),lib25519_dispatch_hash_sha512_compiler(impl));
} else {
crypto_hash = lib25519_hash_sha512;
printf("hash_sha512 selected implementation %s compiler %s\n",lib25519_hash_sha512_implementation(),lib25519_hash_sha512_compiler());
}
randombytes(h,lib25519_hash_sha512_BYTES);
randombytes(m,MAXTEST_BYTES);
mlen = 0;
while (mlen <= MAXTEST_BYTES) {
randombytes(m,mlen);
for (long long i = 0;i <= TIMINGS;++i) {
t[i] = cpucycles();
crypto_hash(h,m,mlen);
}
t_print("hash_sha512",impl,mlen);
mlen += 1+mlen/2;
}
}
}
static void measure_pow_inv25519(void)
{
if (targeto && strcmp(targeto,"pow")) return;
if (targetp && strcmp(targetp,"inv25519")) return;
unsigned char *n = alignedcalloc(lib25519_pow_inv25519_BYTES);
unsigned char *ne = alignedcalloc(lib25519_pow_inv25519_BYTES);
for (long long impl = -1;impl < lib25519_numimpl_pow_inv25519();++impl) {
void (*crypto_pow)(unsigned char *,const unsigned char *);
if (targeti && strcmp(targeti,lib25519_dispatch_pow_inv25519_implementation(impl))) continue;
if (impl >= 0) {
crypto_pow = lib25519_dispatch_pow_inv25519(impl);
printf("pow_inv25519 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_pow_inv25519_implementation(impl),lib25519_dispatch_pow_inv25519_compiler(impl));
} else {
crypto_pow = lib25519_pow_inv25519;
printf("pow_inv25519 selected implementation %s compiler %s\n",lib25519_pow_inv25519_implementation(),lib25519_pow_inv25519_compiler());
}
randombytes(n,lib25519_pow_inv25519_BYTES);
randombytes(ne,lib25519_pow_inv25519_BYTES);
for (long long i = 0;i <= TIMINGS;++i) {
t[i] = cpucycles();
crypto_pow(ne,n);
}
t_print("pow_inv25519",impl,lib25519_pow_inv25519_BYTES);
}
}
static void measure_nP_montgomery25519(void)
{
if (targeto && strcmp(targeto,"nP")) return;
if (targetp && strcmp(targetp,"montgomery25519")) return;
unsigned char *n = alignedcalloc(lib25519_nP_montgomery25519_SCALARBYTES);
unsigned char *P = alignedcalloc(lib25519_nP_montgomery25519_POINTBYTES);
unsigned char *nP = alignedcalloc(lib25519_nP_montgomery25519_POINTBYTES);
for (long long impl = -1;impl < lib25519_numimpl_nP_montgomery25519();++impl) {
void (*crypto_nP)(unsigned char *,const unsigned char *,const unsigned char *);
if (targeti && strcmp(targeti,lib25519_dispatch_nP_montgomery25519_implementation(impl))) continue;
if (impl >= 0) {
crypto_nP = lib25519_dispatch_nP_montgomery25519(impl);
printf("nP_montgomery25519 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_nP_montgomery25519_implementation(impl),lib25519_dispatch_nP_montgomery25519_compiler(impl));
} else {
crypto_nP = lib25519_nP_montgomery25519;
printf("nP_montgomery25519 selected implementation %s compiler %s\n",lib25519_nP_montgomery25519_implementation(),lib25519_nP_montgomery25519_compiler());
}
randombytes(n,lib25519_nP_montgomery25519_SCALARBYTES);
randombytes(P,lib25519_nP_montgomery25519_POINTBYTES);
randombytes(nP,lib25519_nP_montgomery25519_POINTBYTES);
for (long long i = 0;i <= TIMINGS;++i) {
t[i] = cpucycles();
crypto_nP(nP,n,P);
}
t_print("nP_montgomery25519",impl,lib25519_nP_montgomery25519_POINTBYTES);
}
}
static void measure_nG_merged25519(void)
{
if (targeto && strcmp(targeto,"nG")) return;
if (targetp && strcmp(targetp,"merged25519")) return;
unsigned char *n = alignedcalloc(lib25519_nP_SCALARBYTES);
unsigned char *nG = alignedcalloc(lib25519_nP_POINTBYTES);
for (long long impl = -1;impl < lib25519_numimpl_nG_merged25519();++impl) {
void (*crypto_nG)(unsigned char *,const unsigned char *);
if (targeti && strcmp(targeti,lib25519_dispatch_nG_merged25519_implementation(impl))) continue;
if (impl >= 0) {
crypto_nG = lib25519_dispatch_nG_merged25519(impl);
printf("nG_merged25519 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_nG_merged25519_implementation(impl),lib25519_dispatch_nG_merged25519_compiler(impl));
} else {
crypto_nG = lib25519_nG_merged25519;
printf("nG_merged25519 selected implementation %s compiler %s\n",lib25519_nG_merged25519_implementation(),lib25519_nG_merged25519_compiler());
}
randombytes(n,lib25519_nP_SCALARBYTES);
randombytes(nG,lib25519_nP_POINTBYTES);
for (long long i = 0;i <= TIMINGS;++i) {
t[i] = cpucycles();
crypto_nG(nG,n);
}
t_print("nG_merged25519",impl,lib25519_nP_POINTBYTES);
}
}
static void measure_nG_montgomery25519(void)
{
if (targeto && strcmp(targeto,"nG")) return;
if (targetp && strcmp(targetp,"montgomery25519")) return;
unsigned char *n = alignedcalloc(lib25519_nP_SCALARBYTES);
unsigned char *nG = alignedcalloc(lib25519_nP_POINTBYTES);
for (long long impl = -1;impl < lib25519_numimpl_nG_montgomery25519();++impl) {
void (*crypto_nG)(unsigned char *,const unsigned char *);
if (targeti && strcmp(targeti,lib25519_dispatch_nG_montgomery25519_implementation(impl))) continue;
if (impl >= 0) {
crypto_nG = lib25519_dispatch_nG_montgomery25519(impl);
printf("nG_montgomery25519 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_nG_montgomery25519_implementation(impl),lib25519_dispatch_nG_montgomery25519_compiler(impl));
} else {
crypto_nG = lib25519_nG_montgomery25519;
printf("nG_montgomery25519 selected implementation %s compiler %s\n",lib25519_nG_montgomery25519_implementation(),lib25519_nG_montgomery25519_compiler());
}
randombytes(n,lib25519_nP_SCALARBYTES);
randombytes(nG,lib25519_nP_POINTBYTES);
for (long long i = 0;i <= TIMINGS;++i) {
t[i] = cpucycles();
crypto_nG(nG,n);
}
t_print("nG_montgomery25519",impl,lib25519_nP_POINTBYTES);
}
}
static void measure_mGnP_ed25519(void)
{
if (targeto && strcmp(targeto,"mGnP")) return;
if (targetp && strcmp(targetp,"ed25519")) return;
unsigned char *mGnP = alignedcalloc(lib25519_mGnP_ed25519_OUTPUTBYTES);
unsigned char *m = alignedcalloc(lib25519_mGnP_ed25519_MBYTES);
unsigned char *n = alignedcalloc(lib25519_mGnP_ed25519_NBYTES);
unsigned char *P = alignedcalloc(lib25519_mGnP_ed25519_PBYTES);
for (long long impl = -1;impl < lib25519_numimpl_mGnP_ed25519();++impl) {
void (*crypto_mGnP)(unsigned char *,const unsigned char *,const unsigned char *,const unsigned char *);
if (targeti && strcmp(targeti,lib25519_dispatch_mGnP_ed25519_implementation(impl))) continue;
if (impl >= 0) {
crypto_mGnP = lib25519_dispatch_mGnP_ed25519(impl);
printf("mGnP_ed25519 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_mGnP_ed25519_implementation(impl),lib25519_dispatch_mGnP_ed25519_compiler(impl));
} else {
crypto_mGnP = lib25519_mGnP_ed25519;
printf("mGnP_ed25519 selected implementation %s compiler %s\n",lib25519_mGnP_ed25519_implementation(),lib25519_mGnP_ed25519_compiler());
}
randombytes(mGnP,lib25519_mGnP_ed25519_OUTPUTBYTES);
randombytes(m,lib25519_mGnP_ed25519_MBYTES);
randombytes(n,lib25519_mGnP_ed25519_NBYTES);
randombytes(P,lib25519_mGnP_ed25519_PBYTES);
for (long long i = 0;i <= TIMINGS;++i) {
t[i] = cpucycles();
crypto_mGnP(mGnP,m,n,P);
}
t_print("mGnP_ed25519",impl,lib25519_mGnP_ed25519_OUTPUTBYTES);
}
}
static void measure_dh_x25519(void)
{
if (targeto && strcmp(targeto,"dh")) return;
if (targetp && strcmp(targetp,"x25519")) return;
unsigned char *pka = alignedcalloc(lib25519_dh_x25519_PUBLICKEYBYTES);
unsigned char *ska = alignedcalloc(lib25519_dh_x25519_SECRETKEYBYTES);
unsigned char *pkb = alignedcalloc(lib25519_dh_x25519_PUBLICKEYBYTES);
unsigned char *skb = alignedcalloc(lib25519_dh_x25519_SECRETKEYBYTES);
unsigned char *ka = alignedcalloc(lib25519_dh_x25519_BYTES);
for (long long impl = -1;impl < lib25519_numimpl_dh_x25519();++impl) {
void (*crypto_dh_keypair)(unsigned char *,unsigned char *);
void (*crypto_dh)(unsigned char *,const unsigned char *,const unsigned char *);
if (targeti && strcmp(targeti,lib25519_dispatch_dh_x25519_implementation(impl))) continue;
if (impl >= 0) {
crypto_dh_keypair = lib25519_dispatch_dh_x25519_keypair(impl);
crypto_dh = lib25519_dispatch_dh_x25519(impl);
printf("dh_x25519 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_dh_x25519_implementation(impl),lib25519_dispatch_dh_x25519_compiler(impl));
} else {
crypto_dh_keypair = lib25519_dh_x25519_keypair;
crypto_dh = lib25519_dh_x25519;
printf("dh_x25519 selected implementation %s compiler %s\n",lib25519_dh_x25519_implementation(),lib25519_dh_x25519_compiler());
}
randombytes(pka,lib25519_dh_x25519_PUBLICKEYBYTES);
randombytes(ska,lib25519_dh_x25519_SECRETKEYBYTES);
randombytes(pkb,lib25519_dh_x25519_PUBLICKEYBYTES);
randombytes(skb,lib25519_dh_x25519_SECRETKEYBYTES);
randombytes(ka,lib25519_dh_x25519_BYTES);
for (long long i = 0;i <= TIMINGS;++i) {
t[i] = cpucycles();
crypto_dh_keypair(pka,ska);
}
t_print("dh_x25519_keypair",impl,lib25519_dh_x25519_PUBLICKEYBYTES);
crypto_dh_keypair(pkb,skb);
for (long long i = 0;i <= TIMINGS;++i) {
t[i] = cpucycles();
crypto_dh(ka,pkb,ska);
}
t_print("dh_x25519",impl,lib25519_dh_x25519_BYTES);
}
}
static void measure_sign_ed25519(void)
{
if (targeto && strcmp(targeto,"sign")) return;
if (targetp && strcmp(targetp,"ed25519")) return;
unsigned char *pk = alignedcalloc(lib25519_sign_ed25519_PUBLICKEYBYTES);
unsigned char *sk = alignedcalloc(lib25519_sign_ed25519_SECRETKEYBYTES);
unsigned char *m = alignedcalloc(MAXTEST_BYTES+lib25519_sign_ed25519_BYTES);
unsigned char *sm = alignedcalloc(MAXTEST_BYTES+lib25519_sign_ed25519_BYTES);
unsigned char *m2 = alignedcalloc(MAXTEST_BYTES+lib25519_sign_ed25519_BYTES);
long long mlen;
long long smlen;
long long m2len;
for (long long impl = -1;impl < lib25519_numimpl_sign_ed25519();++impl) {
void (*crypto_sign_keypair)(unsigned char *,unsigned char *);
void (*crypto_sign)(unsigned char *,long long *,const unsigned char *,long long,const unsigned char *);
int (*crypto_sign_open)(unsigned char *,long long *,const unsigned char *,long long,const unsigned char *);
if (targeti && strcmp(targeti,lib25519_dispatch_sign_ed25519_implementation(impl))) continue;
if (impl >= 0) {
crypto_sign_keypair = lib25519_dispatch_sign_ed25519_keypair(impl);
crypto_sign = lib25519_dispatch_sign_ed25519(impl);
crypto_sign_open = lib25519_dispatch_sign_ed25519_open(impl);
printf("sign_ed25519 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_sign_ed25519_implementation(impl),lib25519_dispatch_sign_ed25519_compiler(impl));
} else {
crypto_sign_keypair = lib25519_sign_ed25519_keypair;
crypto_sign = lib25519_sign_ed25519;
crypto_sign_open = lib25519_sign_ed25519_open;
printf("sign_ed25519 selected implementation %s compiler %s\n",lib25519_sign_ed25519_implementation(),lib25519_sign_ed25519_compiler());
}
randombytes(pk,lib25519_sign_ed25519_PUBLICKEYBYTES);
randombytes(sk,lib25519_sign_ed25519_SECRETKEYBYTES);
randombytes(m,MAXTEST_BYTES+lib25519_sign_ed25519_BYTES);
randombytes(sm,MAXTEST_BYTES+lib25519_sign_ed25519_BYTES);
randombytes(m2,MAXTEST_BYTES+lib25519_sign_ed25519_BYTES);
for (long long i = 0;i <= TIMINGS;++i) {
t[i] = cpucycles();
crypto_sign_keypair(pk,sk);
}
t_print("sign_ed25519_keypair",impl,lib25519_sign_ed25519_PUBLICKEYBYTES);
mlen = 0;
while (mlen <= MAXTEST_BYTES) {
randombytes(m,mlen);
for (long long i = 0;i <= TIMINGS;++i) {
t[i] = cpucycles();
crypto_sign(sm,&smlen,m,mlen,sk);
}
t_print("sign_ed25519",impl,mlen);
mlen += 1+mlen/4;
}
mlen = 0;
while (mlen <= MAXTEST_BYTES) {
randombytes(m,mlen);
lib25519_sign(sm,&smlen,m,mlen,sk);
for (long long i = 0;i <= TIMINGS;++i) {
t[i] = cpucycles();
crypto_sign_open(m2,&m2len,sm,smlen,pk);
}
t_print("sign_ed25519_open",impl,mlen);
/* this is, in principle, not a test program */
/* but some checks here help validate the data flow above */
assert(m2len == mlen);
assert(!memcmp(m,m2,mlen));
mlen += 1+mlen/4;
}
}
}
#include "print_cpuid.inc"
int main(int argc,char **argv)
{
printf("lib25519 version %s\n",lib25519_version);
printf("lib25519 arch %s\n",lib25519_arch);
print_cpuid();
if (*argv) ++argv;
if (*argv) {
targeto = *argv++;
if (*argv) {
targetp = *argv++;
if (*argv) {
targeti = *argv++;
}
}
}
measure_cpucycles();
measure_randombytes();
limits();
measure_verify_32();
measure_hashblocks_sha512();
measure_hash_sha512();
measure_pow_inv25519();
measure_nP_montgomery25519();
measure_nG_merged25519();
measure_nG_montgomery25519();
measure_mGnP_ed25519();
measure_dh_x25519();
measure_sign_ed25519();
return 0;
}
File diff suppressed because it is too large Load Diff
+17
View File
@@ -0,0 +1,17 @@
static void limits()
{
#ifdef RLIM_INFINITY
struct rlimit r;
r.rlim_cur = 0;
r.rlim_max = 0;
#ifdef RLIMIT_NOFILE
setrlimit(RLIMIT_NOFILE,&r);
#endif
#ifdef RLIMIT_NPROC
setrlimit(RLIMIT_NPROC,&r);
#endif
#ifdef RLIMIT_CORE
setrlimit(RLIMIT_CORE,&r);
#endif
#endif
}
@@ -0,0 +1,9 @@
static void print_cpuid(void)
{
unsigned int cpuid[32];
lib25519_cpuid(cpuid,32);
printf("cpuid");
for (long long j = 0;j < 32;++j)
printf(" %08x",cpuid[j]);
printf("\n");
}
+2
View File
@@ -0,0 +1,2 @@
gcc -Wall -fPIC -fwrapv -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mtune=sandybridge
clang -Wall -fPIC -fwrapv -Qunused-arguments -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mtune=sandybridge
@@ -0,0 +1,2 @@
gcc -Wall -fPIC -fwrapv -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -mtune=haswell
clang -Wall -fPIC -fwrapv -Qunused-arguments -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -mtune=haswell
@@ -0,0 +1,2 @@
gcc -Wall -fPIC -fwrapv -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -madx -mtune=skylake
clang -Wall -fPIC -fwrapv -Qunused-arguments -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -madx -mtune=skylake
@@ -0,0 +1,2 @@
gcc -Wall -fPIC -fwrapv -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -madx -mavx512f -mavx512vl -mtune=skylake
clang -Wall -fPIC -fwrapv -Qunused-arguments -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -madx -mavx512f -mavx512vl -mtune=skylake
@@ -0,0 +1,2 @@
gcc -Wall -fPIC -fwrapv -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -madx -mavx512f -mavx512vl -mavx512ifma -mtune=skylake
clang -Wall -fPIC -fwrapv -Qunused-arguments -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -madx -mavx512f -mavx512vl -mavx512ifma -mtune=skylake
@@ -0,0 +1,37 @@
#define CPUID(func,leaf,a,b,c,d) \
__asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"a"(func),"c"(leaf):)
#define WANT_1_3 ((1<<23)|(1<<25)|(1<<26))
/* 23=mmx; 25=sse; 26=sse2 */
#define WANT_1_2 ((1<<0)|(1<<9)|(1<<19)|(1<<20)|(1<<27)|(1<<28))
/* 0=sse3; 9=ssse3; 19=sse41; 20=sse42; 27=osxsave; 28=avx */
#define WANT_7_1 ((1<<3)|(1<<5)|(1<<8)|(1<<16)|(1<<19)|(1<<21)|(1<<31))
/* 3=bmi1; 5=avx2; 8=bmi2; 16=avx512_f; 19=adx; 21=avx512_ifma; 31=avx512_vl */
#define WANT_XCR ((1<<1)|(1<<2))
/* 1=xmm; 2=ymm */
int supports(void)
{
unsigned int cpuidmax,id0,id1,id2;
unsigned int familymodelstepping;
unsigned int feature0,feature1,feature2,feature3;
unsigned int xcrlow,xcrhigh;
CPUID(0,0,cpuidmax,id0,id1,id2);
if (cpuidmax < 7) return 0;
CPUID(1,0,familymodelstepping,feature1,feature2,feature3);
if (WANT_1_2 != (WANT_1_2 & feature2)) return 0;
if (WANT_1_3 != (WANT_1_3 & feature3)) return 0;
CPUID(7,0,feature0,feature1,feature2,feature3);
if (WANT_7_1 != (WANT_7_1 & feature1)) return 0;
asm(".byte 15;.byte 1;.byte 208":"=a"(xcrlow),"=d"(xcrhigh):"c"(0));
if (WANT_XCR != (WANT_XCR & xcrlow)) return 0;
return 1;
}
@@ -0,0 +1,37 @@
#define CPUID(func,leaf,a,b,c,d) \
__asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"a"(func),"c"(leaf):)
#define WANT_1_3 ((1<<23)|(1<<25)|(1<<26))
/* 23=mmx; 25=sse; 26=sse2 */
#define WANT_1_2 ((1<<0)|(1<<9)|(1<<19)|(1<<20)|(1<<27)|(1<<28))
/* 0=sse3; 9=ssse3; 19=sse41; 20=sse42; 27=osxsave; 28=avx */
#define WANT_7_1 ((1<<3)|(1<<5)|(1<<8)|(1<<16)|(1<<19)|(1<<31))
/* 3=bmi1; 5=avx2; 8=bmi2; 16=avx512_f; 19=adx; 31=avx512_vl */
#define WANT_XCR ((1<<1)|(1<<2))
/* 1=xmm; 2=ymm */
int supports(void)
{
unsigned int cpuidmax,id0,id1,id2;
unsigned int familymodelstepping;
unsigned int feature0,feature1,feature2,feature3;
unsigned int xcrlow,xcrhigh;
CPUID(0,0,cpuidmax,id0,id1,id2);
if (cpuidmax < 7) return 0;
CPUID(1,0,familymodelstepping,feature1,feature2,feature3);
if (WANT_1_2 != (WANT_1_2 & feature2)) return 0;
if (WANT_1_3 != (WANT_1_3 & feature3)) return 0;
CPUID(7,0,feature0,feature1,feature2,feature3);
if (WANT_7_1 != (WANT_7_1 & feature1)) return 0;
asm(".byte 15;.byte 1;.byte 208":"=a"(xcrlow),"=d"(xcrhigh):"c"(0));
if (WANT_XCR != (WANT_XCR & xcrlow)) return 0;
return 1;
}
@@ -0,0 +1,37 @@
#define CPUID(func,leaf,a,b,c,d) \
__asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"a"(func),"c"(leaf):)
#define WANT_1_3 ((1<<23)|(1<<25)|(1<<26))
/* 23=mmx; 25=sse; 26=sse2 */
#define WANT_1_2 ((1<<0)|(1<<9)|(1<<19)|(1<<20)|(1<<27)|(1<<28))
/* 0=sse3; 9=ssse3; 19=sse41; 20=sse42; 27=osxsave; 28=avx */
#define WANT_7_1 ((1<<3)|(1<<5)|(1<<8)|(1<<19))
/* 3=bmi1; 5=avx2; 8=bmi2; 19=adx */
#define WANT_XCR ((1<<1)|(1<<2))
/* 1=xmm; 2=ymm */
int supports(void)
{
unsigned int cpuidmax,id0,id1,id2;
unsigned int familymodelstepping;
unsigned int feature0,feature1,feature2,feature3;
unsigned int xcrlow,xcrhigh;
CPUID(0,0,cpuidmax,id0,id1,id2);
if (cpuidmax < 7) return 0;
CPUID(1,0,familymodelstepping,feature1,feature2,feature3);
if (WANT_1_2 != (WANT_1_2 & feature2)) return 0;
if (WANT_1_3 != (WANT_1_3 & feature3)) return 0;
CPUID(7,0,feature0,feature1,feature2,feature3);
if (WANT_7_1 != (WANT_7_1 & feature1)) return 0;
asm(".byte 15;.byte 1;.byte 208":"=a"(xcrlow),"=d"(xcrhigh):"c"(0));
if (WANT_XCR != (WANT_XCR & xcrlow)) return 0;
return 1;
}
@@ -0,0 +1,53 @@
/*
gcc has __builtin_cpu_supports("avx2")
but implemented it incorrectly until 2018:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85100
as of 2022, many machines still have buggy versions of gcc
furthermore, why is checking just for avx2 enough?
has intel guaranteed that it will never introduce
a cpu with avx2 instructions and without (e.g.) sse4.2?
so manually check cpuid and xgetbv here
and include all the "lower" instruction sets
rather than trying to guess which ones are implied
*/
#define CPUID(func,leaf,a,b,c,d) \
__asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"a"(func),"c"(leaf):)
#define WANT_1_3 ((1<<23)|(1<<25)|(1<<26))
/* 23=mmx; 25=sse; 26=sse2 */
#define WANT_1_2 ((1<<0)|(1<<9)|(1<<19)|(1<<20)|(1<<27)|(1<<28))
/* 0=sse3; 9=ssse3; 19=sse41; 20=sse42; 27=osxsave; 28=avx */
#define WANT_7_1 ((1<<3)|(1<<5)|(1<<8))
/* 3=bmi1; 5=avx2; 8=bmi2 */
#define WANT_XCR ((1<<1)|(1<<2))
/* 1=xmm; 2=ymm */
int supports(void)
{
unsigned int cpuidmax,id0,id1,id2;
unsigned int familymodelstepping;
unsigned int feature0,feature1,feature2,feature3;
unsigned int xcrlow,xcrhigh;
CPUID(0,0,cpuidmax,id0,id1,id2);
if (cpuidmax < 7) return 0;
CPUID(1,0,familymodelstepping,feature1,feature2,feature3);
if (WANT_1_2 != (WANT_1_2 & feature2)) return 0;
if (WANT_1_3 != (WANT_1_3 & feature3)) return 0;
CPUID(7,0,feature0,feature1,feature2,feature3);
if (WANT_7_1 != (WANT_7_1 & feature1)) return 0;
asm(".byte 15;.byte 1;.byte 208":"=a"(xcrlow),"=d"(xcrhigh):"c"(0));
if (WANT_XCR != (WANT_XCR & xcrlow)) return 0;
return 1;
}
+31
View File
@@ -0,0 +1,31 @@
#define CPUID(func,leaf,a,b,c,d) \
__asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"a"(func),"c"(leaf):)
#define WANT_1_3 ((1<<23)|(1<<25)|(1<<26))
/* 23=mmx; 25=sse; 26=sse2 */
#define WANT_1_2 ((1<<0)|(1<<9)|(1<<19)|(1<<20)|(1<<27)|(1<<28))
/* 0=sse3; 9=ssse3; 19=sse41; 20=sse42; 27=osxsave; 28=avx */
#define WANT_XCR ((1<<1)|(1<<2))
/* 1=xmm; 2=ymm */
int supports(void)
{
unsigned int cpuidmax,id0,id1,id2;
unsigned int familymodelstepping;
unsigned int feature1,feature2,feature3;
unsigned int xcrlow,xcrhigh;
CPUID(0,0,cpuidmax,id0,id1,id2);
if (cpuidmax < 1) return 0;
CPUID(1,0,familymodelstepping,feature1,feature2,feature3);
if (WANT_1_2 != (WANT_1_2 & feature2)) return 0;
if (WANT_1_3 != (WANT_1_3 & feature3)) return 0;
asm(".byte 15;.byte 1;.byte 208":"=a"(xcrlow),"=d"(xcrhigh):"c"(0));
if (WANT_XCR != (WANT_XCR & xcrlow)) return 0;
return 1;
}
+2
View File
@@ -0,0 +1,2 @@
gcc -Wall -fPIC -fwrapv -O2
clang -Wall -fPIC -fwrapv -Qunused-arguments -O2
Vendored Executable
+1159
View File
File diff suppressed because it is too large Load Diff
+46
View File
@@ -0,0 +1,46 @@
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
static struct perf_event_attr attr;
static int fdperf = -1;
static struct perf_event_mmap_page *buf = 0;
long long ticks_setup(void)
{
if (fdperf == -1) {
attr.type = PERF_TYPE_HARDWARE;
attr.config = PERF_COUNT_HW_CPU_CYCLES;
attr.exclude_kernel = 1;
fdperf = syscall(__NR_perf_event_open,&attr,0,-1,-1,0);
if (fdperf == -1) return 0;
buf = mmap(NULL,sysconf(_SC_PAGESIZE),PROT_READ,MAP_SHARED,fdperf,0);
}
return -1;
}
long long ticks(void)
{
long long result;
unsigned int seq;
long long index;
long long offset;
do {
seq = buf->lock;
asm volatile("" ::: "memory");
index = buf->index;
offset = buf->offset;
asm volatile("rdpmc;shlq $32,%%rdx;orq %%rdx,%%rax"
: "=a"(result) : "c"(index-1) : "%rdx");
asm volatile("" ::: "memory");
} while (buf->lock != seq);
result += offset;
result &= 0xffffffffffff;
return result;
}
+12
View File
@@ -0,0 +1,12 @@
long long ticks_setup(void)
{
return -2;
}
long long ticks(void)
{
unsigned long long result;
asm volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax"
: "=a"(result) :: "%rdx");
return result;
}
+9
View File
@@ -0,0 +1,9 @@
#ifndef cpucycles_h
#define cpucycles_h
extern long long (*cpucycles)(void) __attribute__((visibility("default")));;
extern long long cpucycles_init(void) __attribute__((visibility("default")));;
extern const char *cpucycles_implementation(void) __attribute__((visibility("default")));;
extern long long cpucycles_persecond(void) __attribute__((visibility("default")));;
#endif
@@ -0,0 +1,19 @@
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
long long ticks_setup(void)
{
return 1000000;
}
long long ticks(void)
{
struct timeval t;
long long result;
gettimeofday(&t,(struct timezone *) 0);
result = t.tv_sec;
result *= 1000000;
result += t.tv_usec;
return result;
}
@@ -0,0 +1,18 @@
#include <time.h>
#include <sys/time.h>
long long ticks_setup(void)
{
return 1000000000;
}
long long ticks(void)
{
struct timespec t;
long long result;
clock_gettime(CLOCK_MONOTONIC,&t);
result = t.tv_sec;
result *= 1000000000;
result += t.tv_nsec;
return result;
}
@@ -0,0 +1,9 @@
long long ticks_setup(void)
{
return 0;
}
long long ticks(void)
{
return 0;
}
+4
View File
@@ -0,0 +1,4 @@
amd64-pmc
amd64-tsc
default-monotonic
default-gettimeofday
+216
View File
@@ -0,0 +1,216 @@
#include <stdio.h>
#include <stdlib.h>
#include "cpucycles.h"
static double osfreq(void)
{
FILE *f;
char *x;
double result;
int s;
f = fopen("/etc/cpucyclespersecond", "r");
if (f) {
s = fscanf(f,"%lf",&result);
fclose(f);
if (s > 0) return result;
}
f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed", "r");
if (f) {
s = fscanf(f,"%lf",&result);
fclose(f);
if (s > 0) return 1000.0 * result;
}
f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq", "r");
if (f) {
s = fscanf(f,"%lf",&result);
fclose(f);
if (s > 0) return 1000.0 * result;
}
f = fopen("/sys/devices/system/cpu/cpu0/clock_tick", "r");
if (f) {
s = fscanf(f,"%lf",&result);
fclose(f);
if (s > 0) return result;
}
f = fopen("/proc/cpuinfo","r");
if (f) {
for (;;) {
s = fscanf(f,"cpu MHz : %lf",&result);
if (s > 0) break;
if (s == 0) s = fscanf(f,"%*[^\n]\n");
if (s < 0) { result = 0; break; }
}
fclose(f);
if (result) return 1000000.0 * result;
}
f = fopen("/proc/cpuinfo","r");
if (f) {
for (;;) {
s = fscanf(f,"clock : %lf",&result);
if (s > 0) break;
if (s == 0) s = fscanf(f,"%*[^\n]\n");
if (s < 0) { result = 0; break; }
}
fclose(f);
if (result) return 1000000.0 * result;
}
f = popen("sysctl hw.cpufrequency 2>/dev/null","r");
if (f) {
s = fscanf(f,"hw.cpufrequency: %lf",&result);
pclose(f);
if (s > 0) if (result > 0) return result;
}
f = popen("/usr/sbin/lsattr -E -l proc0 -a frequency 2>/dev/null","r");
if (f) {
s = fscanf(f,"frequency %lf",&result);
pclose(f);
if (s > 0) return result;
}
f = popen("/usr/sbin/psrinfo -v 2>/dev/null","r");
if (f) {
for (;;) {
s = fscanf(f," The %*s processor operates at %lf MHz",&result);
if (s > 0) break;
if (s == 0) s = fscanf(f,"%*[^\n]\n");
if (s < 0) { result = 0; break; }
}
pclose(f);
if (result) return 1000000.0 * result;
}
x = getenv("cpucyclespersecond");
if (x) {
s = sscanf(x,"%lf",&result);
if (s > 0) return result;
}
return 0;
}
static long long persecond = 0;
const char *implementation = "none";
long long (*cpucycles)(void) = cpucycles_init;
const char *cpucycles_implementation(void)
{
cpucycles();
return implementation;
}
long long cpucycles_persecond(void)
{
cpucycles();
return persecond;
}
static double cpucycles_scaled_scaling = 0;
static long long (*cpucycles_scaled_from)(void) = 0;
static long long cpucycles_scaled(void)
{
return cpucycles_scaled_from()*cpucycles_scaled_scaling;
}
#include "options.inc"
#define CALLS 1000
long long cpucycles_init(void)
{
long long precision[NUMOPTIONS];
long long scaling[NUMOPTIONS];
long long bestprecision;
long long bestopt;
persecond = osfreq();
for (long long opt = 0;opt < NUMOPTIONS;++opt) {
long long freq = options[opt].ticks_setup();
// freq > 0: freq ticks per second
// freq == 0: do not use
// freq == -1: cycle counter (e.g., rdpmc)
// freq == -2: probably cycle counter (e.g., rdtsc)
// freq == -3: tick counter every N cycles for some unknown N
precision[opt] = 0;
if (freq > 0) { // means: freq ticks per second
scaling[opt] = persecond*1.0/freq;
} else if (freq == -1) { // means: cycle counter; e.g., rdpmc
scaling[opt] = 1.0;
} else if (freq == -2) { // means: probably cycle counter; e.g., rdtsc
scaling[opt] = 1.0;
} else {
continue;
}
for (long long tries = 0;tries < 10;++tries) {
long long t[CALLS+1];
long long ok = 1;
if (scaling[opt] == 1.0) {
for (long long i = 0;i <= CALLS;++i)
t[i] = options[opt].ticks();
} else {
double scalingopt = scaling[opt];
for (long long i = 0;i <= CALLS;++i)
t[i] = options[opt].ticks()*scalingopt;
}
for (long long i = 0;i < CALLS;++i)
if (t[i] > t[i+1])
ok = 0;
if (t[0] == t[CALLS])
ok = 0;
if (ok) {
long long smallestdiff = 0;
for (long long i = 0;i < CALLS;++i) {
long long diff = t[i+1]-t[i];
if (diff <= 0) continue;
if (smallestdiff == 0 || diff < smallestdiff)
smallestdiff = diff;
}
precision[opt] = smallestdiff;
if (freq != -1)
precision[opt] += 100;
break;
}
// otherwise keep trying
// since !ok can be caused by overflow
// or by core swap
}
}
bestopt = DEFAULTOPTION;
bestprecision = 0;
for (long long opt = 0;opt < NUMOPTIONS;++opt)
if (precision[opt] > 0)
if (!bestprecision || precision[opt] < bestprecision) {
bestopt = opt;
bestprecision = precision[opt];
}
implementation = options[bestopt].implementation;
if (scaling[bestopt] == 1.0) {
cpucycles = options[bestopt].ticks;
} else {
cpucycles_scaled_scaling = scaling[bestopt];
cpucycles_scaled_from = options[bestopt].ticks;
cpucycles = cpucycles_scaled;
}
return cpucycles();
}
+73
View File
@@ -0,0 +1,73 @@
#include <stdio.h>
#define CPUID(func,leaf,a,b,c,d) \
__asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"a"(func),"c"(leaf):)
__attribute__((visibility("default")))
void lib25519_cpuid(unsigned int *result,long long resultlen)
{
unsigned int a,b,c,d;
unsigned int cpuidmax,extendedcpuidmax;
int havexgetbv = 0;
CPUID(0,0,a,b,c,d);
cpuidmax = a;
if (resultlen > 0) { *result++ = b; --resultlen; }
if (resultlen > 0) { *result++ = c; --resultlen; }
if (resultlen > 0) { *result++ = d; --resultlen; }
a = b = c = d = 0;
CPUID(0x80000000,0,a,b,c,d);
extendedcpuidmax = a;
a = b = c = d = 0;
if (extendedcpuidmax >= 0x80000002) CPUID(0x80000002,0,a,b,c,d);
if (resultlen > 0) { *result++ = a; --resultlen; }
if (resultlen > 0) { *result++ = b; --resultlen; }
if (resultlen > 0) { *result++ = c; --resultlen; }
if (resultlen > 0) { *result++ = d; --resultlen; }
a = b = c = d = 0;
if (extendedcpuidmax >= 0x80000003) CPUID(0x80000003,0,a,b,c,d);
if (resultlen > 0) { *result++ = a; --resultlen; }
if (resultlen > 0) { *result++ = b; --resultlen; }
if (resultlen > 0) { *result++ = c; --resultlen; }
if (resultlen > 0) { *result++ = d; --resultlen; }
a = b = c = d = 0;
if (extendedcpuidmax >= 0x80000004) CPUID(0x80000004,0,a,b,c,d);
if (resultlen > 0) { *result++ = a; --resultlen; }
if (resultlen > 0) { *result++ = b; --resultlen; }
if (resultlen > 0) { *result++ = c; --resultlen; }
if (resultlen > 0) { *result++ = d; --resultlen; }
a = b = c = d = 0;
if (cpuidmax >= 1) CPUID(1,0,a,b,c,d);
if (resultlen > 0) { *result++ = a; --resultlen; }
if (resultlen > 0) { *result++ = b; --resultlen; }
if (resultlen > 0) { *result++ = c; --resultlen; }
if (resultlen > 0) { *result++ = d; --resultlen; }
/* 27=osxsave; 28=avx */
if (((1<<27)|(1<<28)) == (((1<<27)|(1<<28)) & c))
havexgetbv = 1;
a = b = c = d = 0;
if (cpuidmax >= 7) CPUID(7,0,a,b,c,d);
if (resultlen > 0) { *result++ = a; --resultlen; }
if (resultlen > 0) { *result++ = b; --resultlen; }
if (resultlen > 0) { *result++ = c; --resultlen; }
if (resultlen > 0) { *result++ = d; --resultlen; }
a = b = c = d = 0;
if (extendedcpuidmax >= 0x80000001) CPUID(0x80000001,0,a,b,c,d);
if (resultlen > 0) { *result++ = a; --resultlen; }
if (resultlen > 0) { *result++ = b; --resultlen; }
if (resultlen > 0) { *result++ = c; --resultlen; }
if (resultlen > 0) { *result++ = d; --resultlen; }
a = b = c = d = 0;
if (havexgetbv) asm(".byte 15;.byte 1;.byte 208":"=a"(a),"=d"(d):"c"(0));
if (resultlen > 0) { *result++ = a; --resultlen; }
while (resultlen > 0) { *result++ = 0; --resultlen; }
}
+7
View File
@@ -0,0 +1,7 @@
#include <stdio.h>
__attribute__((visibility("default")))
void lib25519_cpuid(unsigned int *result,long long resultlen)
{
while (resultlen > 0) { *result++ = 0; --resultlen; }
}
@@ -0,0 +1,9 @@
#include "crypto_nP_montgomery25519.h"
#include "crypto_dh.h"
void crypto_dh(unsigned char *abshared,
const unsigned char *bobpk,
const unsigned char *alicesk)
{
crypto_nP_montgomery25519(abshared,alicesk,bobpk);
}
@@ -0,0 +1,9 @@
#include "crypto_nG_montgomery25519.h"
#include "randombytes.h"
#include "crypto_dh.h"
void crypto_dh_keypair(unsigned char *pk,unsigned char *sk)
{
randombytes(sk,crypto_dh_SECRETKEYBYTES);
crypto_nG_montgomery25519(pk,sk);
}
@@ -0,0 +1 @@
../ref/api.h
@@ -0,0 +1 @@
../../../crypto_pow/inv25519/sandy2x/architectures
@@ -0,0 +1,78 @@
#include <immintrin.h>
#include "crypto_hashblocks_sha512.h"
#include "crypto_hash.h"
#define blocks crypto_hashblocks_sha512
#define ALIGNED __attribute((aligned(32)))
static const ALIGNED unsigned char iv[64] = {
0x6a,0x09,0xe6,0x67,0xf3,0xbc,0xc9,0x08,
0xbb,0x67,0xae,0x85,0x84,0xca,0xa7,0x3b,
0x3c,0x6e,0xf3,0x72,0xfe,0x94,0xf8,0x2b,
0xa5,0x4f,0xf5,0x3a,0x5f,0x1d,0x36,0xf1,
0x51,0x0e,0x52,0x7f,0xad,0xe6,0x82,0xd1,
0x9b,0x05,0x68,0x8c,0x2b,0x3e,0x6c,0x1f,
0x1f,0x83,0xd9,0xab,0xfb,0x41,0xbd,0x6b,
0x5b,0xe0,0xcd,0x19,0x13,0x7e,0x21,0x79
} ;
typedef unsigned long long uint64;
#define load256(x) (_mm256_loadu_si256((void *) (x)))
#define store256(x,y) (_mm256_storeu_si256((void *) (x),y))
void crypto_hash(unsigned char *out,const unsigned char *in,long long inlen)
{
ALIGNED unsigned char h[64];
ALIGNED unsigned char padded[256];
unsigned long long i;
unsigned long long bytes = inlen;
__m256i X0,X1;
X0 = load256(iv);
X1 = load256(iv + 32);
store256(h,X0);
store256(h + 32,X1);
blocks(h,in,inlen);
in += inlen;
inlen &= 127;
in -= inlen;
X0 ^= X0;
if (inlen < 112) {
store256(padded,X0);
store256(padded + 32,X0);
store256(padded + 64,X0);
store256(padded + 96,X0);
for (i = 0;i < inlen;++i) padded[i] = in[i];
padded[inlen] = 0x80;
padded[119] = bytes >> 61;
*(uint64 *) (padded + 120) = __builtin_bswap64(bytes << 3);
blocks(h,padded,128);
} else {
store256(padded + 96,X0);
store256(padded + 128,X0);
store256(padded + 160,X0);
store256(padded + 192,X0);
store256(padded + 224,X0);
for (i = 0;i < inlen;++i) padded[i] = in[i];
padded[inlen] = 0x80;
padded[247] = bytes >> 61;
*(uint64 *) (padded + 248) = __builtin_bswap64(bytes << 3);
blocks(h,padded,256);
}
X0 = load256(h);
X1 = load256(h + 32);
store256(out,X0);
store256(out + 32,X1);
}
@@ -0,0 +1 @@
../ref/implementors
@@ -0,0 +1 @@
#define CRYPTO_BYTES 64
@@ -0,0 +1,69 @@
/*
20080913
D. J. Bernstein
Public domain.
*/
#include "crypto_hashblocks_sha512.h"
#include "crypto_hash.h"
#define blocks crypto_hashblocks_sha512
static const unsigned char iv[64] = {
0x6a,0x09,0xe6,0x67,0xf3,0xbc,0xc9,0x08,
0xbb,0x67,0xae,0x85,0x84,0xca,0xa7,0x3b,
0x3c,0x6e,0xf3,0x72,0xfe,0x94,0xf8,0x2b,
0xa5,0x4f,0xf5,0x3a,0x5f,0x1d,0x36,0xf1,
0x51,0x0e,0x52,0x7f,0xad,0xe6,0x82,0xd1,
0x9b,0x05,0x68,0x8c,0x2b,0x3e,0x6c,0x1f,
0x1f,0x83,0xd9,0xab,0xfb,0x41,0xbd,0x6b,
0x5b,0xe0,0xcd,0x19,0x13,0x7e,0x21,0x79
} ;
typedef unsigned long long uint64;
void crypto_hash(unsigned char *out,const unsigned char *in,long long inlen)
{
unsigned char h[64];
unsigned char padded[256];
int i;
unsigned long long bytes = inlen;
for (i = 0;i < 64;++i) h[i] = iv[i];
blocks(h,in,inlen);
in += inlen;
inlen &= 127;
in -= inlen;
for (i = 0;i < inlen;++i) padded[i] = in[i];
padded[inlen] = 0x80;
if (inlen < 112) {
for (i = inlen + 1;i < 119;++i) padded[i] = 0;
padded[119] = bytes >> 61;
padded[120] = bytes >> 53;
padded[121] = bytes >> 45;
padded[122] = bytes >> 37;
padded[123] = bytes >> 29;
padded[124] = bytes >> 21;
padded[125] = bytes >> 13;
padded[126] = bytes >> 5;
padded[127] = bytes << 3;
blocks(h,padded,128);
} else {
for (i = inlen + 1;i < 247;++i) padded[i] = 0;
padded[247] = bytes >> 61;
padded[248] = bytes >> 53;
padded[249] = bytes >> 45;
padded[250] = bytes >> 37;
padded[251] = bytes >> 29;
padded[252] = bytes >> 21;
padded[253] = bytes >> 13;
padded[254] = bytes >> 5;
padded[255] = bytes << 3;
blocks(h,padded,256);
}
for (i = 0;i < 64;++i) out[i] = h[i];
}
@@ -0,0 +1 @@
Daniel J. Bernstein (wrapper around crypto_hashblocks/sha512)
@@ -0,0 +1 @@
../m3/api.h
@@ -0,0 +1,2 @@
amd64 avx2
x86 avx2
@@ -0,0 +1 @@
../m3/constants.c
@@ -0,0 +1 @@
../m3/implementors
@@ -0,0 +1,249 @@
#include <immintrin.h>
#include "inner.h"
#define uint64 crypto_uint64
static uint64 load_bigendian(const unsigned char *x)
{
return __builtin_bswap64(*(uint64 *) x);
}
static void store_bigendian(unsigned char *x,uint64 u)
{
*(uint64 *) x = __builtin_bswap64(u);
}
#define SHR(x,c) ((x) >> (c))
#define ROTR(x,c) (((x) >> (c)) | ((x) << (64 - (c))))
#define sigma0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x,7))
#define sigma1(x) (ROTR(x,19) ^ ROTR(x,61) ^ SHR(x,6))
#define Ch(x,y,z) (z ^ (x & (y ^ z)))
#define Maj(x,y,z) ((x & (y ^ z)) ^ (y & z))
#define Sigma0(x) (ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39))
#define Sigma1(x) (ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41))
#define ALIGNED __attribute((aligned(32)))
#define load64(x) (*(uint64 *) (x))
#define store256(x,y) (*(volatile __m256i *) (x) = (y))
#define bigendian64 _mm256_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)
#define PREEXPANDx4(X0,X9,X1) \
X0 = _mm256_add_epi64(X0, \
_mm256_srli_epi64(X1,1) ^ _mm256_slli_epi64(X1,63) ^ \
_mm256_srli_epi64(X1,8) ^ _mm256_slli_epi64(X1,56) ^ \
_mm256_srli_epi64(X1,7) \
); \
X0 = _mm256_add_epi64(X0,X9);
#define POSTEXPANDx4(X0,W0,W2,W14) \
W0 = ( \
_mm256_extracti128_si256(X0,0)); \
W0 = _mm_add_epi64(W0, \
_mm_srli_epi64(W14,19) ^ _mm_slli_epi64(W14,45) ^ \
_mm_srli_epi64(W14,61) ^ _mm_slli_epi64(W14,3) ^ \
_mm_srli_epi64(W14,6)); \
W2 = ( \
_mm256_extracti128_si256(X0,1)); \
W2 = _mm_add_epi64(W2, \
_mm_srli_epi64(W0,19) ^ _mm_slli_epi64(W0,45) ^ \
_mm_srli_epi64(W0,61) ^ _mm_slli_epi64(W0,3) ^ \
_mm_srli_epi64(W0,6)); \
X0 = _mm256_insertf128_si256(_mm256_castsi128_si256(W0),W2,1);
#define ROUND0(i,r0,r1,r2,r3,r4,r5,r6,r7) \
r7 += load64(&wc[i]); \
r7 += Ch(r4,r5,r6); \
r7 += Sigma1(r4); \
r3 += r7; \
r7 += Maj(r2,r0,r1); \
r7 += Sigma0(r0); \
#define ROUND1(i,r0,r1,r2,r3,r4,r5,r6,r7) \
r7 += load64(&wc[i]); \
r7 += Ch(r4,r5,r6); \
r7 += Sigma1(r4); \
r3 += r7; \
r7 += Maj(r0,r1,r2); \
r7 += Sigma0(r0); \
int inner(unsigned char *statebytes,const unsigned char *in,unsigned int inlen,const uint64 *constants)
{
ALIGNED uint64 state[8];
ALIGNED uint64 w[20];
ALIGNED uint64 wc[16]; /* w[i]+constants[i] */
uint64 r0,r1,r2,r3,r4,r5,r6,r7;
__m128i W0,W2,W4,W6,W8,W10,W12,W14;
__m256i X0,X1,X4,X5,X8,X9,X12,X13;
__m256i D0,D4,D8,D12;
int i;
state[0] = r0 = load_bigendian(statebytes);
state[1] = r1 = load_bigendian(statebytes+8);
state[2] = r2 = load_bigendian(statebytes+16);
state[3] = r3 = load_bigendian(statebytes+24);
state[4] = r4 = load_bigendian(statebytes+32);
state[5] = r5 = load_bigendian(statebytes+40);
state[6] = r6 = load_bigendian(statebytes+48);
state[7] = r7 = load_bigendian(statebytes+56);
do {
X0 = _mm256_loadu_si256((void *) (in+0));
X0 = _mm256_shuffle_epi8(X0,bigendian64);
D0 = _mm256_loadu_si256((void *) &constants[0]);
D0 = _mm256_add_epi64(X0,D0);
store256(&wc[0],D0);
store256(&w[0],X0);
store256(&w[16],X0);
X4 = _mm256_loadu_si256((void *) (in+32));
X4 = _mm256_shuffle_epi8(X4,bigendian64);
D4 = _mm256_loadu_si256((void *) &constants[4]);
D4 = _mm256_add_epi64(X4,D4);
store256(&wc[4],D4);
store256(&w[4],X4);
ROUND0(0,r0,r1,r2,r3,r4,r5,r6,r7)
ROUND1(1,r7,r0,r1,r2,r3,r4,r5,r6)
X8 = _mm256_loadu_si256((void *) (in+64));
X8 = _mm256_shuffle_epi8(X8,bigendian64);
D8 = _mm256_loadu_si256((void *) &constants[8]);
D8 = _mm256_add_epi64(X8,D8);
store256(&wc[8],D8);
store256(&w[8],X8);
ROUND0(2,r6,r7,r0,r1,r2,r3,r4,r5)
ROUND1(3,r5,r6,r7,r0,r1,r2,r3,r4)
ROUND0(4,r4,r5,r6,r7,r0,r1,r2,r3)
ROUND1(5,r3,r4,r5,r6,r7,r0,r1,r2)
X12 = _mm256_loadu_si256((void *) (in+96));
X12 = _mm256_shuffle_epi8(X12,bigendian64);
D12 = _mm256_loadu_si256((void *) &constants[12]);
D12 = _mm256_add_epi64(X12,D12);
store256(&wc[12],D12);
store256(&w[12],X12);
ROUND0(6,r2,r3,r4,r5,r6,r7,r0,r1)
ROUND1(7,r1,r2,r3,r4,r5,r6,r7,r0)
ROUND0(8,r0,r1,r2,r3,r4,r5,r6,r7)
ROUND1(9,r7,r0,r1,r2,r3,r4,r5,r6)
for (i = 4;i > 0;--i) {
constants += 16;
X1 = _mm256_loadu_si256((void *) (w+1));
X9 = _mm256_loadu_si256((void *) (w+9));
PREEXPANDx4(X0,X9,X1)
W14 = _mm_loadu_si128((void *) (w+14));
POSTEXPANDx4(X0,W0,W2,W14)
D0 = _mm256_loadu_si256((void *) &constants[0]);
D0 = _mm256_add_epi64(X0,D0);
store256(&wc[0],D0);
store256(w+16,X0);
store256(w+0,X0);
ROUND0(10,r6,r7,r0,r1,r2,r3,r4,r5)
ROUND1(11,r5,r6,r7,r0,r1,r2,r3,r4)
ROUND0(12,r4,r5,r6,r7,r0,r1,r2,r3)
ROUND1(13,r3,r4,r5,r6,r7,r0,r1,r2)
X5 = _mm256_loadu_si256((void *) (w+5));
X13 = _mm256_loadu_si256((void *) (w+13));
PREEXPANDx4(X4,X13,X5)
W2 = _mm_loadu_si128((void *) (w+2));
POSTEXPANDx4(X4,W4,W6,W2)
D4 = _mm256_loadu_si256((void *) &constants[4]);
D4 = _mm256_add_epi64(X4,D4);
store256(&wc[4],D4);
store256(w+4,X4);
ROUND0(14,r2,r3,r4,r5,r6,r7,r0,r1)
ROUND1(15,r1,r2,r3,r4,r5,r6,r7,r0)
ROUND0(0,r0,r1,r2,r3,r4,r5,r6,r7)
ROUND1(1,r7,r0,r1,r2,r3,r4,r5,r6)
X9 = _mm256_loadu_si256((void *) (w+9));
X1 = _mm256_loadu_si256((void *) (w+1));
PREEXPANDx4(X8,X1,X9)
W6 = _mm_loadu_si128((void *) (w+6));
POSTEXPANDx4(X8,W8,W10,W6)
D8 = _mm256_loadu_si256((void *) &constants[8]);
D8 = _mm256_add_epi64(X8,D8);
store256(&wc[8],D8);
store256(w+8,X8);
ROUND0(2,r6,r7,r0,r1,r2,r3,r4,r5)
ROUND1(3,r5,r6,r7,r0,r1,r2,r3,r4)
ROUND0(4,r4,r5,r6,r7,r0,r1,r2,r3)
ROUND1(5,r3,r4,r5,r6,r7,r0,r1,r2)
X13 = _mm256_loadu_si256((void *) (w+13));
X5 = _mm256_loadu_si256((void *) (w+5));
PREEXPANDx4(X12,X5,X13)
W10 = _mm_loadu_si128((void *) (w+10));
POSTEXPANDx4(X12,W12,W14,W10)
D12 = _mm256_loadu_si256((void *) &constants[12]);
D12 = _mm256_add_epi64(X12,D12);
store256(&wc[12],D12);
store256(w+12,X12);
ROUND0(6,r2,r3,r4,r5,r6,r7,r0,r1)
ROUND1(7,r1,r2,r3,r4,r5,r6,r7,r0)
ROUND0(8,r0,r1,r2,r3,r4,r5,r6,r7)
ROUND1(9,r7,r0,r1,r2,r3,r4,r5,r6)
}
{
ROUND0(10,r6,r7,r0,r1,r2,r3,r4,r5)
ROUND1(11,r5,r6,r7,r0,r1,r2,r3,r4)
ROUND0(12,r4,r5,r6,r7,r0,r1,r2,r3)
ROUND1(13,r3,r4,r5,r6,r7,r0,r1,r2)
ROUND0(14,r2,r3,r4,r5,r6,r7,r0,r1)
ROUND1(15,r1,r2,r3,r4,r5,r6,r7,r0)
}
constants -= 64;
r0 += state[0]; state[0] = r0;
r1 += state[1]; state[1] = r1;
r2 += state[2]; state[2] = r2;
r3 += state[3]; state[3] = r3;
r4 += state[4]; state[4] = r4;
r5 += state[5]; state[5] = r5;
r6 += state[6]; state[6] = r6;
r7 += state[7]; state[7] = r7;
in += 128;
inlen -= 128;
} while (inlen >= 128);
for (i = 0;i < 8;++i)
store_bigendian(statebytes+8*i,state[i]);
return inlen;
}
@@ -0,0 +1,10 @@
#ifndef inner_h
#define inner_h
#define inner CRYPTO_NAMESPACE(inner)
#include "crypto_uint64.h"
extern int inner(unsigned char *,const unsigned char *,unsigned int,const crypto_uint64 *);
#endif
@@ -0,0 +1,68 @@
inner.S: \
round01.q \
round23.q \
round45.q \
round67.q \
round89.q \
round1011.q \
round1213.q \
round1415.q \
expand0.q \
expand4.q \
expand8.q \
expand12.q \
rer0.q \
rer4.q \
rer8.q \
rer12.q \
inner.q \
inner.S.do
./inner.S.do < inner.q > inner.S
round01.q: round.py
./round.py 0 > round01.q
round23.q: round.py
./round.py 1 > round23.q
round45.q: round.py
./round.py 2 > round45.q
round67.q: round.py
./round.py 3 > round67.q
round89.q: round.py
./round.py 4 > round89.q
round1011.q: round.py
./round.py 5 > round1011.q
round1213.q: round.py
./round.py 6 > round1213.q
round1415.q: round.py
./round.py 7 > round1415.q
expand0.q: expand.py
./expand.py 0 > expand0.q
expand4.q: expand.py
./expand.py 4 > expand4.q
expand8.q: expand.py
./expand.py 8 > expand8.q
expand12.q: expand.py
./expand.py 12 > expand12.q
rer0.q: rer.py
./rer.py 0 > rer0.q
rer4.q: rer.py
./rer.py 4 > rer4.q
rer8.q: rer.py
./rer.py 8 > rer8.q
rer12.q: rer.py
./rer.py 12 > rer12.q
@@ -0,0 +1 @@
../m3/api.h
@@ -0,0 +1,2 @@
amd64 bmi2 avx2
x86 bmi2 avx2
@@ -0,0 +1,104 @@
#include "crypto_hashblocks.h"
#include "inner.h"
static const crypto_uint64 constants[84] = {
0x428a2f98d728ae22ULL
, 0x7137449123ef65cdULL
, 0xb5c0fbcfec4d3b2fULL
, 0xe9b5dba58189dbbcULL
, 0x3956c25bf348b538ULL
, 0x59f111f1b605d019ULL
, 0x923f82a4af194f9bULL
, 0xab1c5ed5da6d8118ULL
, 0xd807aa98a3030242ULL
, 0x12835b0145706fbeULL
, 0x243185be4ee4b28cULL
, 0x550c7dc3d5ffb4e2ULL
, 0x72be5d74f27b896fULL
, 0x80deb1fe3b1696b1ULL
, 0x9bdc06a725c71235ULL
, 0xc19bf174cf692694ULL
, 0xe49b69c19ef14ad2ULL
, 0xefbe4786384f25e3ULL
, 0x0fc19dc68b8cd5b5ULL
, 0x240ca1cc77ac9c65ULL
, 0x2de92c6f592b0275ULL
, 0x4a7484aa6ea6e483ULL
, 0x5cb0a9dcbd41fbd4ULL
, 0x76f988da831153b5ULL
, 0x983e5152ee66dfabULL
, 0xa831c66d2db43210ULL
, 0xb00327c898fb213fULL
, 0xbf597fc7beef0ee4ULL
, 0xc6e00bf33da88fc2ULL
, 0xd5a79147930aa725ULL
, 0x06ca6351e003826fULL
, 0x142929670a0e6e70ULL
, 0x27b70a8546d22ffcULL
, 0x2e1b21385c26c926ULL
, 0x4d2c6dfc5ac42aedULL
, 0x53380d139d95b3dfULL
, 0x650a73548baf63deULL
, 0x766a0abb3c77b2a8ULL
, 0x81c2c92e47edaee6ULL
, 0x92722c851482353bULL
, 0xa2bfe8a14cf10364ULL
, 0xa81a664bbc423001ULL
, 0xc24b8b70d0f89791ULL
, 0xc76c51a30654be30ULL
, 0xd192e819d6ef5218ULL
, 0xd69906245565a910ULL
, 0xf40e35855771202aULL
, 0x106aa07032bbd1b8ULL
, 0x19a4c116b8d2d0c8ULL
, 0x1e376c085141ab53ULL
, 0x2748774cdf8eeb99ULL
, 0x34b0bcb5e19b48a8ULL
, 0x391c0cb3c5c95a63ULL
, 0x4ed8aa4ae3418acbULL
, 0x5b9cca4f7763e373ULL
, 0x682e6ff3d6b2b8a3ULL
, 0x748f82ee5defb2fcULL
, 0x78a5636f43172f60ULL
, 0x84c87814a1f0ab72ULL
, 0x8cc702081a6439ecULL
, 0x90befffa23631e28ULL
, 0xa4506cebde82bde9ULL
, 0xbef9a3f7b2c67915ULL
, 0xc67178f2e372532bULL
, 0xca273eceea26619cULL
, 0xd186b8c721c0c207ULL
, 0xeada7dd6cde0eb1eULL
, 0xf57d4f7fee6ed178ULL
, 0x06f067aa72176fbaULL
, 0x0a637dc5a2c898a6ULL
, 0x113f9804bef90daeULL
, 0x1b710b35131c471bULL
, 0x28db77f523047d84ULL
, 0x32caab7b40c72493ULL
, 0x3c9ebe0a15c9bebcULL
, 0x431d67c49c100d4cULL
, 0x4cc5d4becb3e42b6ULL
, 0x597f299cfc657e2aULL
, 0x5fcb6fab3ad6faecULL
, 0x6c44198c4a475817ULL
/* constants for the AVX code: */
, 0x0001020304050607ULL
, 0x08090a0b0c0d0e0fULL
, 0x0001020304050607ULL
, 0x08090a0b0c0d0e0fULL
};
#define CUTOFF 65536 /* must be multiple of 128 */
int crypto_hashblocks(unsigned char *statebytes,const unsigned char *in,long long inlen)
{
while (inlen >= CUTOFF) {
inner(statebytes,in,CUTOFF,constants); /* returns 0 */
in += CUTOFF;
inlen -= CUTOFF;
}
if (inlen < 128) return inlen;
return inner(statebytes,in,inlen,constants);
}
+57
View File
@@ -0,0 +1,57 @@
#!/usr/bin/env python3
import sys
for i in (0,4,8,12):
if len(sys.argv) > 1:
if sys.argv[1] != str(i):
continue
i0 = (i+0)&15
i1 = (i+1)&15
i9 = (i+9)&15
i14 = (i+14)&15
print(' X%d = mem256[&w + %d]' % (i1,8*i1))
print(' W%d = mem128[&w + %d],0' % (i14,8*i14))
print('')
print(' 4x X%dright1 = X%d unsigned>> 1' % (i1,i1))
print(' 4x X%dleft63 = X%d << 63' % (i1,i1))
print(' X%dsigma0 = X%dright1 ^ X%dleft63' % (i1,i1,i1))
print(' 4x X%dright8 = X%d unsigned>> 8' % (i1,i1))
print(' X%dsigma0 = X%dsigma0 ^ X%dright8' % (i1,i1,i1))
print(' 2x,0 W%dright19 = W%d unsigned>> 19' % (i14,i14))
print(' 4x X%dleft56 = X%d << 56' % (i1,i1))
print(' 2x,0 W%dleft45 = W%d << 45' % (i14,i14))
print(' X%dsigma0 = X%dsigma0 ^ X%dleft56' % (i1,i1,i1))
print(' 1x,0 W%dsigma1 = W%dright19 ^ W%dleft45' % (i14,i14,i14))
print(' 4x X%dright7 = X%d unsigned>> 7' % (i1,i1))
print(' 2x,0 W%dright61 = W%d unsigned>> 61' % (i14,i14))
print(' X%dsigma0 = X%dsigma0 ^ X%dright7' % (i1,i1,i1))
print(' 1x,0 W%dsigma1 ^= W%dright61' % (i14,i14))
print(' 4x X%d = X%d + X%dsigma0' % (i0,i0,i1))
print(' 2x,0 W%dleft3 = W%d << 3' % (i14,i14))
print(' 4x X%d = X%d + mem256[&w + %d]' % (i0,i0,8*i9))
print(' 1x,0 W%dsigma1 ^= W%dleft3' % (i14,i14))
print(' 2x,0 W%dright6 = W%d unsigned>> 6' % (i14,i14))
print(' 1x,0 W%dsigma1 ^= W%dright6' % (i14,i14))
print(' 4x X%d = W%dsigma1 + X%d' % (i0,i14,i0))
print('')
print(' 2x,0 W%dright19 = X%d unsigned>> 19' % (i0,i0))
print(' 2x,0 W%dleft45 = X%d << 45' % (i0,i0))
print(' 1x,0 W%dsigma1 = W%dright19 ^ W%dleft45' % (i0,i0,i0))
print(' 2x,0 W%dright61 = X%d unsigned>> 61' % (i0,i0))
print(' 1x,0 W%dsigma1 ^= W%dright61' % (i0,i0))
print(' 2x,0 W%dleft3 = X%d << 3' % (i0,i0))
print(' 1x,0 W%dsigma1 ^= W%dleft3' % (i0,i0))
print(' 2x,0 W%dright6 = X%d unsigned>> 6' % (i0,i0))
print(' 1x,0 W%dsigma1 ^= W%dright6' % (i0,i0))
print(' W%dsigma1 = W%dsigma1[1],W%dsigma1[0]' % (i0,i0,i0))
print('')
print(' 4x X%d = X%d + W%dsigma1' % (i0,i0,i0))
if i == 0:
print(' mem256[&w + 128] = X%d' % (i0))
print(' mem256[&w + %d] = X%d' % (8*i0,i0))
print(' 4x D%d = X%d + mem256[constants + %d]' % (i0,i0,8*i0))
print(' wc%d%d%d%d = D%d' % (i,i+1,i+2,i+3,i0))
print('')
@@ -0,0 +1,42 @@
X1 = mem256[&w + 8]
W14 = mem128[&w + 112],0
4x X1right1 = X1 unsigned>> 1
4x X1left63 = X1 << 63
X1sigma0 = X1right1 ^ X1left63
4x X1right8 = X1 unsigned>> 8
X1sigma0 = X1sigma0 ^ X1right8
2x,0 W14right19 = W14 unsigned>> 19
4x X1left56 = X1 << 56
2x,0 W14left45 = W14 << 45
X1sigma0 = X1sigma0 ^ X1left56
1x,0 W14sigma1 = W14right19 ^ W14left45
4x X1right7 = X1 unsigned>> 7
2x,0 W14right61 = W14 unsigned>> 61
X1sigma0 = X1sigma0 ^ X1right7
1x,0 W14sigma1 ^= W14right61
4x X0 = X0 + X1sigma0
2x,0 W14left3 = W14 << 3
4x X0 = X0 + mem256[&w + 72]
1x,0 W14sigma1 ^= W14left3
2x,0 W14right6 = W14 unsigned>> 6
1x,0 W14sigma1 ^= W14right6
4x X0 = W14sigma1 + X0
2x,0 W0right19 = X0 unsigned>> 19
2x,0 W0left45 = X0 << 45
1x,0 W0sigma1 = W0right19 ^ W0left45
2x,0 W0right61 = X0 unsigned>> 61
1x,0 W0sigma1 ^= W0right61
2x,0 W0left3 = X0 << 3
1x,0 W0sigma1 ^= W0left3
2x,0 W0right6 = X0 unsigned>> 6
1x,0 W0sigma1 ^= W0right6
W0sigma1 = W0sigma1[1],W0sigma1[0]
4x X0 = X0 + W0sigma1
mem256[&w + 128] = X0
mem256[&w + 0] = X0
4x D0 = X0 + mem256[constants + 0]
wc0123 = D0
@@ -0,0 +1,41 @@
X13 = mem256[&w + 104]
W10 = mem128[&w + 80],0
4x X13right1 = X13 unsigned>> 1
4x X13left63 = X13 << 63
X13sigma0 = X13right1 ^ X13left63
4x X13right8 = X13 unsigned>> 8
X13sigma0 = X13sigma0 ^ X13right8
2x,0 W10right19 = W10 unsigned>> 19
4x X13left56 = X13 << 56
2x,0 W10left45 = W10 << 45
X13sigma0 = X13sigma0 ^ X13left56
1x,0 W10sigma1 = W10right19 ^ W10left45
4x X13right7 = X13 unsigned>> 7
2x,0 W10right61 = W10 unsigned>> 61
X13sigma0 = X13sigma0 ^ X13right7
1x,0 W10sigma1 ^= W10right61
4x X12 = X12 + X13sigma0
2x,0 W10left3 = W10 << 3
4x X12 = X12 + mem256[&w + 40]
1x,0 W10sigma1 ^= W10left3
2x,0 W10right6 = W10 unsigned>> 6
1x,0 W10sigma1 ^= W10right6
4x X12 = W10sigma1 + X12
2x,0 W12right19 = X12 unsigned>> 19
2x,0 W12left45 = X12 << 45
1x,0 W12sigma1 = W12right19 ^ W12left45
2x,0 W12right61 = X12 unsigned>> 61
1x,0 W12sigma1 ^= W12right61
2x,0 W12left3 = X12 << 3
1x,0 W12sigma1 ^= W12left3
2x,0 W12right6 = X12 unsigned>> 6
1x,0 W12sigma1 ^= W12right6
W12sigma1 = W12sigma1[1],W12sigma1[0]
4x X12 = X12 + W12sigma1
mem256[&w + 96] = X12
4x D12 = X12 + mem256[constants + 96]
wc12131415 = D12
@@ -0,0 +1,41 @@
X5 = mem256[&w + 40]
W2 = mem128[&w + 16],0
4x X5right1 = X5 unsigned>> 1
4x X5left63 = X5 << 63
X5sigma0 = X5right1 ^ X5left63
4x X5right8 = X5 unsigned>> 8
X5sigma0 = X5sigma0 ^ X5right8
2x,0 W2right19 = W2 unsigned>> 19
4x X5left56 = X5 << 56
2x,0 W2left45 = W2 << 45
X5sigma0 = X5sigma0 ^ X5left56
1x,0 W2sigma1 = W2right19 ^ W2left45
4x X5right7 = X5 unsigned>> 7
2x,0 W2right61 = W2 unsigned>> 61
X5sigma0 = X5sigma0 ^ X5right7
1x,0 W2sigma1 ^= W2right61
4x X4 = X4 + X5sigma0
2x,0 W2left3 = W2 << 3
4x X4 = X4 + mem256[&w + 104]
1x,0 W2sigma1 ^= W2left3
2x,0 W2right6 = W2 unsigned>> 6
1x,0 W2sigma1 ^= W2right6
4x X4 = W2sigma1 + X4
2x,0 W4right19 = X4 unsigned>> 19
2x,0 W4left45 = X4 << 45
1x,0 W4sigma1 = W4right19 ^ W4left45
2x,0 W4right61 = X4 unsigned>> 61
1x,0 W4sigma1 ^= W4right61
2x,0 W4left3 = X4 << 3
1x,0 W4sigma1 ^= W4left3
2x,0 W4right6 = X4 unsigned>> 6
1x,0 W4sigma1 ^= W4right6
W4sigma1 = W4sigma1[1],W4sigma1[0]
4x X4 = X4 + W4sigma1
mem256[&w + 32] = X4
4x D4 = X4 + mem256[constants + 32]
wc4567 = D4
@@ -0,0 +1,41 @@
X9 = mem256[&w + 72]
W6 = mem128[&w + 48],0
4x X9right1 = X9 unsigned>> 1
4x X9left63 = X9 << 63
X9sigma0 = X9right1 ^ X9left63
4x X9right8 = X9 unsigned>> 8
X9sigma0 = X9sigma0 ^ X9right8
2x,0 W6right19 = W6 unsigned>> 19
4x X9left56 = X9 << 56
2x,0 W6left45 = W6 << 45
X9sigma0 = X9sigma0 ^ X9left56
1x,0 W6sigma1 = W6right19 ^ W6left45
4x X9right7 = X9 unsigned>> 7
2x,0 W6right61 = W6 unsigned>> 61
X9sigma0 = X9sigma0 ^ X9right7
1x,0 W6sigma1 ^= W6right61
4x X8 = X8 + X9sigma0
2x,0 W6left3 = W6 << 3
4x X8 = X8 + mem256[&w + 8]
1x,0 W6sigma1 ^= W6left3
2x,0 W6right6 = W6 unsigned>> 6
1x,0 W6sigma1 ^= W6right6
4x X8 = W6sigma1 + X8
2x,0 W8right19 = X8 unsigned>> 19
2x,0 W8left45 = X8 << 45
1x,0 W8sigma1 = W8right19 ^ W8left45
2x,0 W8right61 = X8 unsigned>> 61
1x,0 W8sigma1 ^= W8right61
2x,0 W8left3 = X8 << 3
1x,0 W8sigma1 ^= W8left3
2x,0 W8right6 = X8 unsigned>> 6
1x,0 W8sigma1 ^= W8right6
W8sigma1 = W8sigma1[1],W8sigma1[0]
4x X8 = X8 + W8sigma1
mem256[&w + 64] = X8
4x D8 = X8 + mem256[constants + 64]
wc891011 = D8
@@ -0,0 +1 @@
../m3/implementors
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,15 @@
#!/bin/sh
cpp \
| qhasm-amd64avx \
| sed 's/\<inner\>/CRYPTO_SHARED_NAMESPACE(inner)/' \
| sed 's/\<_inner\>/_CRYPTO_SHARED_NAMESPACE(inner)/' \
| sed 's/^\.p2align 5/.p2align 7/' \
| awk '{
found = 0
if (!found && $0 == "and $31,%r11") {
found = 1
$0 = "and $511,%r11"
}
print
}'
@@ -0,0 +1,10 @@
#ifndef inner_h
#define inner_h
#define inner CRYPTO_SHARED_NAMESPACE(inner)
#include "crypto_uint64.h"
extern int inner(unsigned char *,const unsigned char *,unsigned long long,const crypto_uint64 *);
#endif
File diff suppressed because it is too large Load Diff
+198
View File
@@ -0,0 +1,198 @@
#!/usr/bin/env python3
import sys
j = int(sys.argv[1])
j0 = (j+0)&15
j1 = (j+1)&15
j2 = (j+2)&15
j3 = (j+3)&15
j9 = (j+9)&15
j14 = (j+14)&15
k0 = (j+8)&15
k1 = (j+9)&15
k2 = (j+10)&15
k3 = (j+11)&15
wcin = 'wc%d%d%d%d' % (k0,k1,k2,k3)
wcout = 'wc%d%d%d%d' % (j0,j1,j2,j3)
i0 = (j+0)&7
i1 = (j+1)&7
i2 = (j+2)&7
i3 = (j+3)&7
i4 = (j+4)&7
i5 = (j+5)&7
i6 = (j+6)&7
i7 = (j+7)&7
print(' X%d = mem256[&w + %d]' % (j1,8*j1))
print(' 4x X%dright1 = X%d unsigned>> 1' % (j1,j1))
print(' r%dSigma1 = r%d>>>14' % (i4,i4))
print(' r%d += %s[0]' % (i7,wcin))
print(' ch%d = r%d' % (i7,i6))
print(' ch%d ^= r%d' % (i7,i5))
print('')
print(' 4x X%dleft63 = X%d << 63' % (j1,j1))
print(' r%d18 = r%d>>>18' % (i4,i4))
print(' ch%d &= r%d' % (i7,i4))
print(' maj%d = r%d' % (i6,i1))
print(' maj%d ^= r%d' % (i6,i0))
print('')
print(' X%dsigma0 = X%dright1 ^ X%dleft63' % (j1,j1,j1))
print(' r%d41 = r%d>>>41' % (i4,i4))
print(' r%dSigma1 ^= r%d18' % (i4,i4))
print(' ch%d ^= r%d' % (i7,i6))
print('')
print(' 4x X%dright8 = X%d unsigned>> 8' % (j1,j1))
print(' r%dSigma1 ^= r%d41' % (i4,i4))
print(' r%dSigma0 = r%d>>>28' % (i0,i0))
print(' r%d += ch%d' % (i7,i7))
print('')
print(' X%dsigma0 = X%dsigma0 ^ X%dright8' % (j1,j1,j1))
print(' r%d34 = r%d>>>34' % (i0,i0))
print(' r%d += r%dSigma1' % (i7,i4))
print(' maj%d = r%d' % (i7,i2))
print(' maj%d &= maj%d' % (i7,i6))
print('')
print(' W%d = mem128[&w + %d],0' % (j14,8*j14))
print(' 2x,0 W%dright19 = W%d unsigned>> 19' % (j14,j14))
print(' r%dSigma0 ^= r%d34' % (i0,i0))
print(' r%d39 = r%d>>>39' % (i0,i0))
print(' r%d += r%d' % (i3,i7))
print('')
print(' 4x X%dleft56 = X%d << 56' % (j1,j1))
print(' r%dSigma0 ^= r%d39' % (i0,i0))
print(' r%d += %s[1]' % (i6,wcin))
print(' r%dandr%d = r%d' % (i0,i1,i1))
print(' r%dandr%d &= r%d' % (i0,i1,i0))
print('')
print(' 2x,0 W%dleft45 = W%d << 45' % (j14,j14))
print(' r%d += r%dSigma0' % (i7,i0))
print(' maj%d ^= r%dandr%d' % (i7,i0,i1))
print(' ch%d = r%d' % (i6,i5))
print(' ch%d ^= r%d' % (i6,i4))
print('')
print(' X%dsigma0 = X%dsigma0 ^ X%dleft56' % (j1,j1,j1))
print(' 2x,0 W%dright61 = W%d unsigned>> 61' % (j14,j14))
print(' r%dSigma1 = r%d>>>14' % (i3,i3))
print(' r%d += maj%d' % (i7,i7))
print('')
print(' 4x X%dright7 = X%d unsigned>> 7' % (j1,j1))
print(' 1x,0 W%dsigma1 = W%dright19 ^ W%dleft45' % (j14,j14,j14))
print(' r%d18 = r%d>>>18' % (i3,i3))
print(' ch%d &= r%d' % (i6,i3))
print('')
print(' X%dsigma0 = X%dsigma0 ^ X%dright7' % (j1,j1,j1))
print(' r%dSigma1 ^= r%d18' % (i3,i3))
print(' r%d41 = r%d>>>41' % (i3,i3))
print(' maj%d &= r%d' % (i6,i7))
print('')
print(' 1x,0 W%dsigma1 ^= W%dright61' % (j14,j14))
print(' 4x X%d = X%d + mem256[&w + %d]' % (j0,j0,8*j9))
print(' r%dSigma1 ^= r%d41' % (i3,i3))
print(' maj%d ^= r%dandr%d' % (i6,i0,i1))
print('')
print(' 2x,0 W%dleft3 = W%d << 3' % (j14,j14))
print(' r%dSigma0 = r%d>>>28' % (i7,i7))
print(' ch%d ^= r%d' % (i6,i5))
print(' r%d += r%dSigma1' % (i6,i3))
print('')
print(' 4x X%d = X%d + X%dsigma0' % (j0,j0,j1))
print(' r%d34 = r%d>>>34' % (i7,i7))
print(' r%d += %s[2]' % (i5,wcin))
print(' r%d += ch%d' % (i6,i6))
print('')
print(' 1x,0 W%dsigma1 ^= W%dleft3' % (j14,j14))
print(' r%dSigma0 ^= r%d34' % (i7,i7))
print(' r%d39 = r%d>>>39' % (i7,i7))
print(' r%d += r%d' % (i2,i6))
print('')
print(' 2x,0 W%dright6 = W%d unsigned>> 6' % (j14,j14))
print(' r%dSigma0 ^= r%d39' % (i7,i7))
print(' r%d += maj%d' % (i6,i6))
print(' ch%d = r%d' % (i5,i4))
print(' ch%d ^= r%d' % (i5,i3))
print('')
print(' 1x,0 W%dsigma1 ^= W%dright6' % (j14,j14))
print(' r%d += r%dSigma0' % (i6,i7))
print(' r%dSigma1 = r%d>>>14' % (i2,i2))
print(' ch%d &= r%d' % (i5,i2))
print('')
print(' 4x X%d = W%dsigma1 + X%d' % (j0,j14,j0))
print(' r%d18 = r%d>>>18' % (i2,i2))
print(' r%d41 = r%d>>>41' % (i2,i2))
print(' ch%d ^= r%d' % (i5,i4))
print('')
print(' 2x,0 W%dright19 = X%d unsigned>> 19' % (j0,j0))
print(' r%dSigma1 ^= r%d18' % (i2,i2))
print(' r%dSigma0 = r%d>>>28' % (i6,i6))
print(' r%d += ch%d' % (i5,i5))
print('')
print(' 2x,0 W%dleft45 = X%d << 45' % (j0,j0))
print(' r%dSigma1 ^= r%d41' % (i2,i2))
print(' r%d34 = r%d>>>34' % (i6,i6))
print(' maj%d = r%d' % (i4,i7))
print(' maj%d ^= r%d' % (i4,i6))
print('')
print(' 2x,0 W%dright61 = X%d unsigned>> 61' % (j0,j0))
print(' 1x,0 W%dsigma1 = W%dright19 ^ W%dleft45' % (j0,j0,j0))
print(' r%dSigma0 ^= r%d34' % (i6,i6))
print(' r%d39 = r%d>>>39' % (i6,i6))
print('')
print(' 2x,0 W%dleft3 = X%d << 3' % (j0,j0))
print(' 1x,0 W%dsigma1 ^= W%dright61' % (j0,j0))
print(' r%dSigma0 ^= r%d39' % (i6,i6))
print(' r%d += r%dSigma1' % (i5,i2))
print('')
print(' 2x,0 W%dright6 = X%d unsigned>> 6' % (j0,j0))
print(' 1x,0 W%dsigma1 ^= W%dleft3' % (j0,j0))
print(' r%d += r%d' % (i1,i5))
print(' r%dandr%d = r%d' % (i6,i7,i7))
print(' r%dandr%d &= r%d' % (i6,i7,i6))
print('')
print(' 1x,0 W%dsigma1 ^= W%dright6' % (j0,j0))
print(' r%dSigma1 = r%d>>>14' % (i1,i1))
print(' r%d += r%dSigma0' % (i5,i6))
print(' maj%d = r%d' % (i5,i0))
print(' maj%d &= maj%d' % (i5,i4))
print('')
print(' W%dsigma1 = W%dsigma1[1],W%dsigma1[0]' % (j0,j0,j0))
print(' maj%d ^= r%dandr%d' % (i5,i6,i7))
print(' ch%d = r%d' % (i4,i3))
print(' ch%d ^= r%d' % (i4,i2))
print('')
print(' r%d += maj%d' % (i5,i5))
print(' ch%d &= r%d' % (i4,i1))
print(' r%d18 = r%d>>>18' % (i1,i1))
print('')
print(' maj%d &= r%d' % (i4,i5))
print(' ch%d ^= r%d' % (i4,i3))
print(' r%d += %s[3]' % (i4,wcin))
print(' r%dSigma1 ^= r%d18' % (i1,i1))
print('')
print(' r%d41 = r%d>>>41' % (i1,i1))
print(' 4x X%d = X%d + W%dsigma1' % (j0,j0,j0))
if j0 == 0:
print(' mem256[&w + 128] = X%d' % (j0))
print(' mem256[&w + %d] = X%d' % (8*j0,j0))
print(' r%d += ch%d' % (i4,i4))
print(' maj%d ^= r%dandr%d' % (i4,i6,i7))
print('')
print(' r%dSigma0 = r%d>>>28' % (i5,i5))
print(' 4x D%d = X%d + mem256[constants + %d]' % (j0,j0,8*j0))
print(' %s = D%d' % (wcout,j0))
print(' r%d34 = r%d>>>34' % (i5,i5))
print(' r%dSigma1 ^= r%d41' % (i1,i1))
print('')
print(' r%d += r%dSigma1' % (i4,i1))
print(' r%dSigma0 ^= r%d34' % (i5,i5))
print(' r%d39 = r%d>>>39' % (i5,i5))
print('')
print(' r%d += r%d' % (i0,i4))
print(' r%d += maj%d' % (i4,i4))
print(' r%dSigma0 ^= r%d39' % (i5,i5))
print('')
print(' r%d += r%dSigma0' % (i4,i5))
@@ -0,0 +1,167 @@
X1 = mem256[&w + 8]
4x X1right1 = X1 unsigned>> 1
r4Sigma1 = r4>>>14
r7 += wc891011[0]
ch7 = r6
ch7 ^= r5
4x X1left63 = X1 << 63
r418 = r4>>>18
ch7 &= r4
maj6 = r1
maj6 ^= r0
X1sigma0 = X1right1 ^ X1left63
r441 = r4>>>41
r4Sigma1 ^= r418
ch7 ^= r6
4x X1right8 = X1 unsigned>> 8
r4Sigma1 ^= r441
r0Sigma0 = r0>>>28
r7 += ch7
X1sigma0 = X1sigma0 ^ X1right8
r034 = r0>>>34
r7 += r4Sigma1
maj7 = r2
maj7 &= maj6
W14 = mem128[&w + 112],0
2x,0 W14right19 = W14 unsigned>> 19
r0Sigma0 ^= r034
r039 = r0>>>39
r3 += r7
4x X1left56 = X1 << 56
r0Sigma0 ^= r039
r6 += wc891011[1]
r0andr1 = r1
r0andr1 &= r0
2x,0 W14left45 = W14 << 45
r7 += r0Sigma0
maj7 ^= r0andr1
ch6 = r5
ch6 ^= r4
X1sigma0 = X1sigma0 ^ X1left56
2x,0 W14right61 = W14 unsigned>> 61
r3Sigma1 = r3>>>14
r7 += maj7
4x X1right7 = X1 unsigned>> 7
1x,0 W14sigma1 = W14right19 ^ W14left45
r318 = r3>>>18
ch6 &= r3
X1sigma0 = X1sigma0 ^ X1right7
r3Sigma1 ^= r318
r341 = r3>>>41
maj6 &= r7
1x,0 W14sigma1 ^= W14right61
4x X0 = X0 + mem256[&w + 72]
r3Sigma1 ^= r341
maj6 ^= r0andr1
2x,0 W14left3 = W14 << 3
r7Sigma0 = r7>>>28
ch6 ^= r5
r6 += r3Sigma1
4x X0 = X0 + X1sigma0
r734 = r7>>>34
r5 += wc891011[2]
r6 += ch6
1x,0 W14sigma1 ^= W14left3
r7Sigma0 ^= r734
r739 = r7>>>39
r2 += r6
2x,0 W14right6 = W14 unsigned>> 6
r7Sigma0 ^= r739
r6 += maj6
ch5 = r4
ch5 ^= r3
1x,0 W14sigma1 ^= W14right6
r6 += r7Sigma0
r2Sigma1 = r2>>>14
ch5 &= r2
4x X0 = W14sigma1 + X0
r218 = r2>>>18
r241 = r2>>>41
ch5 ^= r4
2x,0 W0right19 = X0 unsigned>> 19
r2Sigma1 ^= r218
r6Sigma0 = r6>>>28
r5 += ch5
2x,0 W0left45 = X0 << 45
r2Sigma1 ^= r241
r634 = r6>>>34
maj4 = r7
maj4 ^= r6
2x,0 W0right61 = X0 unsigned>> 61
1x,0 W0sigma1 = W0right19 ^ W0left45
r6Sigma0 ^= r634
r639 = r6>>>39
2x,0 W0left3 = X0 << 3
1x,0 W0sigma1 ^= W0right61
r6Sigma0 ^= r639
r5 += r2Sigma1
2x,0 W0right6 = X0 unsigned>> 6
1x,0 W0sigma1 ^= W0left3
r1 += r5
r6andr7 = r7
r6andr7 &= r6
1x,0 W0sigma1 ^= W0right6
r1Sigma1 = r1>>>14
r5 += r6Sigma0
maj5 = r0
maj5 &= maj4
W0sigma1 = W0sigma1[1],W0sigma1[0]
maj5 ^= r6andr7
ch4 = r3
ch4 ^= r2
r5 += maj5
ch4 &= r1
r118 = r1>>>18
maj4 &= r5
ch4 ^= r3
r4 += wc891011[3]
r1Sigma1 ^= r118
r141 = r1>>>41
4x X0 = X0 + W0sigma1
mem256[&w + 128] = X0
mem256[&w + 0] = X0
r4 += ch4
maj4 ^= r6andr7
r5Sigma0 = r5>>>28
4x D0 = X0 + mem256[constants + 0]
wc0123 = D0
r534 = r5>>>34
r1Sigma1 ^= r141
r4 += r1Sigma1
r5Sigma0 ^= r534
r539 = r5>>>39
r0 += r4
r4 += maj4
r5Sigma0 ^= r539
r4 += r5Sigma0
@@ -0,0 +1,166 @@
X13 = mem256[&w + 104]
4x X13right1 = X13 unsigned>> 1
r0Sigma1 = r0>>>14
r3 += wc4567[0]
ch3 = r2
ch3 ^= r1
4x X13left63 = X13 << 63
r018 = r0>>>18
ch3 &= r0
maj2 = r5
maj2 ^= r4
X13sigma0 = X13right1 ^ X13left63
r041 = r0>>>41
r0Sigma1 ^= r018
ch3 ^= r2
4x X13right8 = X13 unsigned>> 8
r0Sigma1 ^= r041
r4Sigma0 = r4>>>28
r3 += ch3
X13sigma0 = X13sigma0 ^ X13right8
r434 = r4>>>34
r3 += r0Sigma1
maj3 = r6
maj3 &= maj2
W10 = mem128[&w + 80],0
2x,0 W10right19 = W10 unsigned>> 19
r4Sigma0 ^= r434
r439 = r4>>>39
r7 += r3
4x X13left56 = X13 << 56
r4Sigma0 ^= r439
r2 += wc4567[1]
r4andr5 = r5
r4andr5 &= r4
2x,0 W10left45 = W10 << 45
r3 += r4Sigma0
maj3 ^= r4andr5
ch2 = r1
ch2 ^= r0
X13sigma0 = X13sigma0 ^ X13left56
2x,0 W10right61 = W10 unsigned>> 61
r7Sigma1 = r7>>>14
r3 += maj3
4x X13right7 = X13 unsigned>> 7
1x,0 W10sigma1 = W10right19 ^ W10left45
r718 = r7>>>18
ch2 &= r7
X13sigma0 = X13sigma0 ^ X13right7
r7Sigma1 ^= r718
r741 = r7>>>41
maj2 &= r3
1x,0 W10sigma1 ^= W10right61
4x X12 = X12 + mem256[&w + 40]
r7Sigma1 ^= r741
maj2 ^= r4andr5
2x,0 W10left3 = W10 << 3
r3Sigma0 = r3>>>28
ch2 ^= r1
r2 += r7Sigma1
4x X12 = X12 + X13sigma0
r334 = r3>>>34
r1 += wc4567[2]
r2 += ch2
1x,0 W10sigma1 ^= W10left3
r3Sigma0 ^= r334
r339 = r3>>>39
r6 += r2
2x,0 W10right6 = W10 unsigned>> 6
r3Sigma0 ^= r339
r2 += maj2
ch1 = r0
ch1 ^= r7
1x,0 W10sigma1 ^= W10right6
r2 += r3Sigma0
r6Sigma1 = r6>>>14
ch1 &= r6
4x X12 = W10sigma1 + X12
r618 = r6>>>18
r641 = r6>>>41
ch1 ^= r0
2x,0 W12right19 = X12 unsigned>> 19
r6Sigma1 ^= r618
r2Sigma0 = r2>>>28
r1 += ch1
2x,0 W12left45 = X12 << 45
r6Sigma1 ^= r641
r234 = r2>>>34
maj0 = r3
maj0 ^= r2
2x,0 W12right61 = X12 unsigned>> 61
1x,0 W12sigma1 = W12right19 ^ W12left45
r2Sigma0 ^= r234
r239 = r2>>>39
2x,0 W12left3 = X12 << 3
1x,0 W12sigma1 ^= W12right61
r2Sigma0 ^= r239
r1 += r6Sigma1
2x,0 W12right6 = X12 unsigned>> 6
1x,0 W12sigma1 ^= W12left3
r5 += r1
r2andr3 = r3
r2andr3 &= r2
1x,0 W12sigma1 ^= W12right6
r5Sigma1 = r5>>>14
r1 += r2Sigma0
maj1 = r4
maj1 &= maj0
W12sigma1 = W12sigma1[1],W12sigma1[0]
maj1 ^= r2andr3
ch0 = r7
ch0 ^= r6
r1 += maj1
ch0 &= r5
r518 = r5>>>18
maj0 &= r1
ch0 ^= r7
r0 += wc4567[3]
r5Sigma1 ^= r518
r541 = r5>>>41
4x X12 = X12 + W12sigma1
mem256[&w + 96] = X12
r0 += ch0
maj0 ^= r2andr3
r1Sigma0 = r1>>>28
4x D12 = X12 + mem256[constants + 96]
wc12131415 = D12
r134 = r1>>>34
r5Sigma1 ^= r541
r0 += r5Sigma1
r1Sigma0 ^= r134
r139 = r1>>>39
r4 += r0
r0 += maj0
r1Sigma0 ^= r139
r0 += r1Sigma0
@@ -0,0 +1,166 @@
X5 = mem256[&w + 40]
4x X5right1 = X5 unsigned>> 1
r0Sigma1 = r0>>>14
r3 += wc12131415[0]
ch3 = r2
ch3 ^= r1
4x X5left63 = X5 << 63
r018 = r0>>>18
ch3 &= r0
maj2 = r5
maj2 ^= r4
X5sigma0 = X5right1 ^ X5left63
r041 = r0>>>41
r0Sigma1 ^= r018
ch3 ^= r2
4x X5right8 = X5 unsigned>> 8
r0Sigma1 ^= r041
r4Sigma0 = r4>>>28
r3 += ch3
X5sigma0 = X5sigma0 ^ X5right8
r434 = r4>>>34
r3 += r0Sigma1
maj3 = r6
maj3 &= maj2
W2 = mem128[&w + 16],0
2x,0 W2right19 = W2 unsigned>> 19
r4Sigma0 ^= r434
r439 = r4>>>39
r7 += r3
4x X5left56 = X5 << 56
r4Sigma0 ^= r439
r2 += wc12131415[1]
r4andr5 = r5
r4andr5 &= r4
2x,0 W2left45 = W2 << 45
r3 += r4Sigma0
maj3 ^= r4andr5
ch2 = r1
ch2 ^= r0
X5sigma0 = X5sigma0 ^ X5left56
2x,0 W2right61 = W2 unsigned>> 61
r7Sigma1 = r7>>>14
r3 += maj3
4x X5right7 = X5 unsigned>> 7
1x,0 W2sigma1 = W2right19 ^ W2left45
r718 = r7>>>18
ch2 &= r7
X5sigma0 = X5sigma0 ^ X5right7
r7Sigma1 ^= r718
r741 = r7>>>41
maj2 &= r3
1x,0 W2sigma1 ^= W2right61
4x X4 = X4 + mem256[&w + 104]
r7Sigma1 ^= r741
maj2 ^= r4andr5
2x,0 W2left3 = W2 << 3
r3Sigma0 = r3>>>28
ch2 ^= r1
r2 += r7Sigma1
4x X4 = X4 + X5sigma0
r334 = r3>>>34
r1 += wc12131415[2]
r2 += ch2
1x,0 W2sigma1 ^= W2left3
r3Sigma0 ^= r334
r339 = r3>>>39
r6 += r2
2x,0 W2right6 = W2 unsigned>> 6
r3Sigma0 ^= r339
r2 += maj2
ch1 = r0
ch1 ^= r7
1x,0 W2sigma1 ^= W2right6
r2 += r3Sigma0
r6Sigma1 = r6>>>14
ch1 &= r6
4x X4 = W2sigma1 + X4
r618 = r6>>>18
r641 = r6>>>41
ch1 ^= r0
2x,0 W4right19 = X4 unsigned>> 19
r6Sigma1 ^= r618
r2Sigma0 = r2>>>28
r1 += ch1
2x,0 W4left45 = X4 << 45
r6Sigma1 ^= r641
r234 = r2>>>34
maj0 = r3
maj0 ^= r2
2x,0 W4right61 = X4 unsigned>> 61
1x,0 W4sigma1 = W4right19 ^ W4left45
r2Sigma0 ^= r234
r239 = r2>>>39
2x,0 W4left3 = X4 << 3
1x,0 W4sigma1 ^= W4right61
r2Sigma0 ^= r239
r1 += r6Sigma1
2x,0 W4right6 = X4 unsigned>> 6
1x,0 W4sigma1 ^= W4left3
r5 += r1
r2andr3 = r3
r2andr3 &= r2
1x,0 W4sigma1 ^= W4right6
r5Sigma1 = r5>>>14
r1 += r2Sigma0
maj1 = r4
maj1 &= maj0
W4sigma1 = W4sigma1[1],W4sigma1[0]
maj1 ^= r2andr3
ch0 = r7
ch0 ^= r6
r1 += maj1
ch0 &= r5
r518 = r5>>>18
maj0 &= r1
ch0 ^= r7
r0 += wc12131415[3]
r5Sigma1 ^= r518
r541 = r5>>>41
4x X4 = X4 + W4sigma1
mem256[&w + 32] = X4
r0 += ch0
maj0 ^= r2andr3
r1Sigma0 = r1>>>28
4x D4 = X4 + mem256[constants + 32]
wc4567 = D4
r134 = r1>>>34
r5Sigma1 ^= r541
r0 += r5Sigma1
r1Sigma0 ^= r134
r139 = r1>>>39
r4 += r0
r0 += maj0
r1Sigma0 ^= r139
r0 += r1Sigma0
@@ -0,0 +1,166 @@
X9 = mem256[&w + 72]
4x X9right1 = X9 unsigned>> 1
r4Sigma1 = r4>>>14
r7 += wc0123[0]
ch7 = r6
ch7 ^= r5
4x X9left63 = X9 << 63
r418 = r4>>>18
ch7 &= r4
maj6 = r1
maj6 ^= r0
X9sigma0 = X9right1 ^ X9left63
r441 = r4>>>41
r4Sigma1 ^= r418
ch7 ^= r6
4x X9right8 = X9 unsigned>> 8
r4Sigma1 ^= r441
r0Sigma0 = r0>>>28
r7 += ch7
X9sigma0 = X9sigma0 ^ X9right8
r034 = r0>>>34
r7 += r4Sigma1
maj7 = r2
maj7 &= maj6
W6 = mem128[&w + 48],0
2x,0 W6right19 = W6 unsigned>> 19
r0Sigma0 ^= r034
r039 = r0>>>39
r3 += r7
4x X9left56 = X9 << 56
r0Sigma0 ^= r039
r6 += wc0123[1]
r0andr1 = r1
r0andr1 &= r0
2x,0 W6left45 = W6 << 45
r7 += r0Sigma0
maj7 ^= r0andr1
ch6 = r5
ch6 ^= r4
X9sigma0 = X9sigma0 ^ X9left56
2x,0 W6right61 = W6 unsigned>> 61
r3Sigma1 = r3>>>14
r7 += maj7
4x X9right7 = X9 unsigned>> 7
1x,0 W6sigma1 = W6right19 ^ W6left45
r318 = r3>>>18
ch6 &= r3
X9sigma0 = X9sigma0 ^ X9right7
r3Sigma1 ^= r318
r341 = r3>>>41
maj6 &= r7
1x,0 W6sigma1 ^= W6right61
4x X8 = X8 + mem256[&w + 8]
r3Sigma1 ^= r341
maj6 ^= r0andr1
2x,0 W6left3 = W6 << 3
r7Sigma0 = r7>>>28
ch6 ^= r5
r6 += r3Sigma1
4x X8 = X8 + X9sigma0
r734 = r7>>>34
r5 += wc0123[2]
r6 += ch6
1x,0 W6sigma1 ^= W6left3
r7Sigma0 ^= r734
r739 = r7>>>39
r2 += r6
2x,0 W6right6 = W6 unsigned>> 6
r7Sigma0 ^= r739
r6 += maj6
ch5 = r4
ch5 ^= r3
1x,0 W6sigma1 ^= W6right6
r6 += r7Sigma0
r2Sigma1 = r2>>>14
ch5 &= r2
4x X8 = W6sigma1 + X8
r218 = r2>>>18
r241 = r2>>>41
ch5 ^= r4
2x,0 W8right19 = X8 unsigned>> 19
r2Sigma1 ^= r218
r6Sigma0 = r6>>>28
r5 += ch5
2x,0 W8left45 = X8 << 45
r2Sigma1 ^= r241
r634 = r6>>>34
maj4 = r7
maj4 ^= r6
2x,0 W8right61 = X8 unsigned>> 61
1x,0 W8sigma1 = W8right19 ^ W8left45
r6Sigma0 ^= r634
r639 = r6>>>39
2x,0 W8left3 = X8 << 3
1x,0 W8sigma1 ^= W8right61
r6Sigma0 ^= r639
r5 += r2Sigma1
2x,0 W8right6 = X8 unsigned>> 6
1x,0 W8sigma1 ^= W8left3
r1 += r5
r6andr7 = r7
r6andr7 &= r6
1x,0 W8sigma1 ^= W8right6
r1Sigma1 = r1>>>14
r5 += r6Sigma0
maj5 = r0
maj5 &= maj4
W8sigma1 = W8sigma1[1],W8sigma1[0]
maj5 ^= r6andr7
ch4 = r3
ch4 ^= r2
r5 += maj5
ch4 &= r1
r118 = r1>>>18
maj4 &= r5
ch4 ^= r3
r4 += wc0123[3]
r1Sigma1 ^= r118
r141 = r1>>>41
4x X8 = X8 + W8sigma1
mem256[&w + 64] = X8
r4 += ch4
maj4 ^= r6andr7
r5Sigma0 = r5>>>28
4x D8 = X8 + mem256[constants + 64]
wc891011 = D8
r534 = r5>>>34
r1Sigma1 ^= r141
r4 += r1Sigma1
r5Sigma0 ^= r534
r539 = r5>>>39
r0 += r4
r4 += maj4
r5Sigma0 ^= r539
r4 += r5Sigma0
+90
View File
@@ -0,0 +1,90 @@
#!/usr/bin/env python3
import sys
i = 0
for doubleround in range(8):
i0 = (i+0)&7
i1 = (i+1)&7
i2 = (i+2)&7
i3 = (i+3)&7
i4 = (i+4)&7
i5 = (i+5)&7
i6 = (i+6)&7
i7 = (i+7)&7
round = 2*doubleround
round4 = round&~3
loadarray = 'wc%d%d%d%d' % (round4,round4+1,round4+2,round4+3)
i0load = '%s[%d]' % (loadarray,(round+0)&3)
i1load = '%s[%d]' % (loadarray,(round+1)&3)
i -= 2
if len(sys.argv) > 1:
if sys.argv[1] != str(doubleround):
continue
print(' r%d += %s' % (i7,i0load))
print(' r%dSigma1 = r%d>>>14' % (i4,i4))
print(' ch%d = r%d' % (i7,i6))
print(' r%d18 = r%d>>>18' % (i4,i4))
print(' ch%d ^= r%d' % (i7,i5))
print(' r%d41 = r%d>>>41' % (i4,i4))
print(' r%dSigma1 ^= r%d18' % (i4,i4))
print(' ch%d &= r%d' % (i7,i4))
print(' r%dSigma0 = r%d>>>28' % (i0,i0))
print(' r%dSigma1 ^= r%d41' % (i4,i4))
print(' ch%d ^= r%d' % (i7,i6))
print(' r%d34 = r%d>>>34' % (i0,i0))
print(' maj%d = r%d' % (i6,i1))
print(' maj%d ^= r%d' % (i6,i0))
print(' r%dSigma0 ^= r%d34' % (i0,i0))
print(' r%d += ch%d' % (i7,i7))
print(' r%dandr%d = r%d' % (i0,i1,i1))
print(' r%d39 = r%d>>>39' % (i0,i0))
print(' r%dandr%d &= r%d' % (i0,i1,i0))
print(' r%dSigma0 ^= r%d39' % (i0,i0))
print(' r%d += r%dSigma1' % (i7,i4))
print(' maj%d = r%d' % (i7,i2))
print(' r%d += %s' % (i6,i1load))
print(' maj%d &= maj%d' % (i7,i6))
print(' r%d += r%d' % (i3,i7))
print(' r%d += r%dSigma0' % (i7,i0))
print(' ch%d = r%d' % (i6,i5))
print(' maj%d ^= r%dandr%d' % (i7,i0,i1))
print(' ch%d ^= r%d' % (i6,i4))
print(' r%dSigma1 = r%d>>>14' % (i3,i3))
print(' r%d += maj%d' % (i7,i7))
print(' ch%d &= r%d' % (i6,i3))
print(' r%d18 = r%d>>>18' % (i3,i3))
print(' r%dSigma1 ^= r%d18' % (i3,i3))
print(' maj%d &= r%d' % (i6,i7))
print(' ch%d ^= r%d' % (i6,i5))
print(' r%d41 = r%d>>>41' % (i3,i3))
print(' r%dSigma1 ^= r%d41' % (i3,i3))
print(' r%dSigma0 = r%d>>>28' % (i7,i7))
print(' maj%d ^= r%dandr%d' % (i6,i0,i1))
print(' r%d += ch%d' % (i6,i6))
print(' r%d += r%dSigma1' % (i6,i3))
print(' r%d34 = r%d>>>34' % (i7,i7))
print(' r%dSigma0 ^= r%d34' % (i7,i7))
print(' r%d += r%d' % (i2,i6))
print(' r%d += maj%d' % (i6,i6))
print(' r%d39 = r%d>>>39' % (i7,i7))
print(' r%dSigma0 ^= r%d39' % (i7,i7))
print(' r%d += r%dSigma0' % (i6,i7))
@@ -0,0 +1,49 @@
r7 += wc0123[0]
r4Sigma1 = r4>>>14
ch7 = r6
r418 = r4>>>18
ch7 ^= r5
r441 = r4>>>41
r4Sigma1 ^= r418
ch7 &= r4
r0Sigma0 = r0>>>28
r4Sigma1 ^= r441
ch7 ^= r6
r034 = r0>>>34
maj6 = r1
maj6 ^= r0
r0Sigma0 ^= r034
r7 += ch7
r0andr1 = r1
r039 = r0>>>39
r0andr1 &= r0
r0Sigma0 ^= r039
r7 += r4Sigma1
maj7 = r2
r6 += wc0123[1]
maj7 &= maj6
r3 += r7
r7 += r0Sigma0
ch6 = r5
maj7 ^= r0andr1
ch6 ^= r4
r3Sigma1 = r3>>>14
r7 += maj7
ch6 &= r3
r318 = r3>>>18
r3Sigma1 ^= r318
maj6 &= r7
ch6 ^= r5
r341 = r3>>>41
r3Sigma1 ^= r341
r7Sigma0 = r7>>>28
maj6 ^= r0andr1
r6 += ch6
r6 += r3Sigma1
r734 = r7>>>34
r7Sigma0 ^= r734
r2 += r6
r6 += maj6
r739 = r7>>>39
r7Sigma0 ^= r739
r6 += r7Sigma0
@@ -0,0 +1,49 @@
r5 += wc891011[2]
r2Sigma1 = r2>>>14
ch5 = r4
r218 = r2>>>18
ch5 ^= r3
r241 = r2>>>41
r2Sigma1 ^= r218
ch5 &= r2
r6Sigma0 = r6>>>28
r2Sigma1 ^= r241
ch5 ^= r4
r634 = r6>>>34
maj4 = r7
maj4 ^= r6
r6Sigma0 ^= r634
r5 += ch5
r6andr7 = r7
r639 = r6>>>39
r6andr7 &= r6
r6Sigma0 ^= r639
r5 += r2Sigma1
maj5 = r0
r4 += wc891011[3]
maj5 &= maj4
r1 += r5
r5 += r6Sigma0
ch4 = r3
maj5 ^= r6andr7
ch4 ^= r2
r1Sigma1 = r1>>>14
r5 += maj5
ch4 &= r1
r118 = r1>>>18
r1Sigma1 ^= r118
maj4 &= r5
ch4 ^= r3
r141 = r1>>>41
r1Sigma1 ^= r141
r5Sigma0 = r5>>>28
maj4 ^= r6andr7
r4 += ch4
r4 += r1Sigma1
r534 = r5>>>34
r5Sigma0 ^= r534
r0 += r4
r4 += maj4
r539 = r5>>>39
r5Sigma0 ^= r539
r4 += r5Sigma0
@@ -0,0 +1,49 @@
r3 += wc12131415[0]
r0Sigma1 = r0>>>14
ch3 = r2
r018 = r0>>>18
ch3 ^= r1
r041 = r0>>>41
r0Sigma1 ^= r018
ch3 &= r0
r4Sigma0 = r4>>>28
r0Sigma1 ^= r041
ch3 ^= r2
r434 = r4>>>34
maj2 = r5
maj2 ^= r4
r4Sigma0 ^= r434
r3 += ch3
r4andr5 = r5
r439 = r4>>>39
r4andr5 &= r4
r4Sigma0 ^= r439
r3 += r0Sigma1
maj3 = r6
r2 += wc12131415[1]
maj3 &= maj2
r7 += r3
r3 += r4Sigma0
ch2 = r1
maj3 ^= r4andr5
ch2 ^= r0
r7Sigma1 = r7>>>14
r3 += maj3
ch2 &= r7
r718 = r7>>>18
r7Sigma1 ^= r718
maj2 &= r3
ch2 ^= r1
r741 = r7>>>41
r7Sigma1 ^= r741
r3Sigma0 = r3>>>28
maj2 ^= r4andr5
r2 += ch2
r2 += r7Sigma1
r334 = r3>>>34
r3Sigma0 ^= r334
r6 += r2
r2 += maj2
r339 = r3>>>39
r3Sigma0 ^= r339
r2 += r3Sigma0
@@ -0,0 +1,49 @@
r1 += wc12131415[2]
r6Sigma1 = r6>>>14
ch1 = r0
r618 = r6>>>18
ch1 ^= r7
r641 = r6>>>41
r6Sigma1 ^= r618
ch1 &= r6
r2Sigma0 = r2>>>28
r6Sigma1 ^= r641
ch1 ^= r0
r234 = r2>>>34
maj0 = r3
maj0 ^= r2
r2Sigma0 ^= r234
r1 += ch1
r2andr3 = r3
r239 = r2>>>39
r2andr3 &= r2
r2Sigma0 ^= r239
r1 += r6Sigma1
maj1 = r4
r0 += wc12131415[3]
maj1 &= maj0
r5 += r1
r1 += r2Sigma0
ch0 = r7
maj1 ^= r2andr3
ch0 ^= r6
r5Sigma1 = r5>>>14
r1 += maj1
ch0 &= r5
r518 = r5>>>18
r5Sigma1 ^= r518
maj0 &= r1
ch0 ^= r7
r541 = r5>>>41
r5Sigma1 ^= r541
r1Sigma0 = r1>>>28
maj0 ^= r2andr3
r0 += ch0
r0 += r5Sigma1
r134 = r1>>>34
r1Sigma0 ^= r134
r4 += r0
r0 += maj0
r139 = r1>>>39
r1Sigma0 ^= r139
r0 += r1Sigma0
@@ -0,0 +1,49 @@
r5 += wc0123[2]
r2Sigma1 = r2>>>14
ch5 = r4
r218 = r2>>>18
ch5 ^= r3
r241 = r2>>>41
r2Sigma1 ^= r218
ch5 &= r2
r6Sigma0 = r6>>>28
r2Sigma1 ^= r241
ch5 ^= r4
r634 = r6>>>34
maj4 = r7
maj4 ^= r6
r6Sigma0 ^= r634
r5 += ch5
r6andr7 = r7
r639 = r6>>>39
r6andr7 &= r6
r6Sigma0 ^= r639
r5 += r2Sigma1
maj5 = r0
r4 += wc0123[3]
maj5 &= maj4
r1 += r5
r5 += r6Sigma0
ch4 = r3
maj5 ^= r6andr7
ch4 ^= r2
r1Sigma1 = r1>>>14
r5 += maj5
ch4 &= r1
r118 = r1>>>18
r1Sigma1 ^= r118
maj4 &= r5
ch4 ^= r3
r141 = r1>>>41
r1Sigma1 ^= r141
r5Sigma0 = r5>>>28
maj4 ^= r6andr7
r4 += ch4
r4 += r1Sigma1
r534 = r5>>>34
r5Sigma0 ^= r534
r0 += r4
r4 += maj4
r539 = r5>>>39
r5Sigma0 ^= r539
r4 += r5Sigma0
@@ -0,0 +1,49 @@
r3 += wc4567[0]
r0Sigma1 = r0>>>14
ch3 = r2
r018 = r0>>>18
ch3 ^= r1
r041 = r0>>>41
r0Sigma1 ^= r018
ch3 &= r0
r4Sigma0 = r4>>>28
r0Sigma1 ^= r041
ch3 ^= r2
r434 = r4>>>34
maj2 = r5
maj2 ^= r4
r4Sigma0 ^= r434
r3 += ch3
r4andr5 = r5
r439 = r4>>>39
r4andr5 &= r4
r4Sigma0 ^= r439
r3 += r0Sigma1
maj3 = r6
r2 += wc4567[1]
maj3 &= maj2
r7 += r3
r3 += r4Sigma0
ch2 = r1
maj3 ^= r4andr5
ch2 ^= r0
r7Sigma1 = r7>>>14
r3 += maj3
ch2 &= r7
r718 = r7>>>18
r7Sigma1 ^= r718
maj2 &= r3
ch2 ^= r1
r741 = r7>>>41
r7Sigma1 ^= r741
r3Sigma0 = r3>>>28
maj2 ^= r4andr5
r2 += ch2
r2 += r7Sigma1
r334 = r3>>>34
r3Sigma0 ^= r334
r6 += r2
r2 += maj2
r339 = r3>>>39
r3Sigma0 ^= r339
r2 += r3Sigma0
@@ -0,0 +1,49 @@
r1 += wc4567[2]
r6Sigma1 = r6>>>14
ch1 = r0
r618 = r6>>>18
ch1 ^= r7
r641 = r6>>>41
r6Sigma1 ^= r618
ch1 &= r6
r2Sigma0 = r2>>>28
r6Sigma1 ^= r641
ch1 ^= r0
r234 = r2>>>34
maj0 = r3
maj0 ^= r2
r2Sigma0 ^= r234
r1 += ch1
r2andr3 = r3
r239 = r2>>>39
r2andr3 &= r2
r2Sigma0 ^= r239
r1 += r6Sigma1
maj1 = r4
r0 += wc4567[3]
maj1 &= maj0
r5 += r1
r1 += r2Sigma0
ch0 = r7
maj1 ^= r2andr3
ch0 ^= r6
r5Sigma1 = r5>>>14
r1 += maj1
ch0 &= r5
r518 = r5>>>18
r5Sigma1 ^= r518
maj0 &= r1
ch0 ^= r7
r541 = r5>>>41
r5Sigma1 ^= r541
r1Sigma0 = r1>>>28
maj0 ^= r2andr3
r0 += ch0
r0 += r5Sigma1
r134 = r1>>>34
r1Sigma0 ^= r134
r4 += r0
r0 += maj0
r139 = r1>>>39
r1Sigma0 ^= r139
r0 += r1Sigma0
@@ -0,0 +1,49 @@
r7 += wc891011[0]
r4Sigma1 = r4>>>14
ch7 = r6
r418 = r4>>>18
ch7 ^= r5
r441 = r4>>>41
r4Sigma1 ^= r418
ch7 &= r4
r0Sigma0 = r0>>>28
r4Sigma1 ^= r441
ch7 ^= r6
r034 = r0>>>34
maj6 = r1
maj6 ^= r0
r0Sigma0 ^= r034
r7 += ch7
r0andr1 = r1
r039 = r0>>>39
r0andr1 &= r0
r0Sigma0 ^= r039
r7 += r4Sigma1
maj7 = r2
r6 += wc891011[1]
maj7 &= maj6
r3 += r7
r7 += r0Sigma0
ch6 = r5
maj7 ^= r0andr1
ch6 ^= r4
r3Sigma1 = r3>>>14
r7 += maj7
ch6 &= r3
r318 = r3>>>18
r3Sigma1 ^= r318
maj6 &= r7
ch6 ^= r5
r341 = r3>>>41
r3Sigma1 ^= r341
r7Sigma0 = r7>>>28
maj6 ^= r0andr1
r6 += ch6
r6 += r3Sigma1
r734 = r7>>>34
r7Sigma0 ^= r734
r2 += r6
r6 += maj6
r739 = r7>>>39
r7Sigma0 ^= r739
r6 += r7Sigma0
@@ -0,0 +1 @@
../m3/api.h
@@ -0,0 +1,218 @@
#include "crypto_hashblocks.h"
typedef unsigned long long uint64;
static uint64 load_bigendian(const unsigned char *x)
{
return
(uint64) (x[7]) \
| (((uint64) (x[6])) << 8) \
| (((uint64) (x[5])) << 16) \
| (((uint64) (x[4])) << 24) \
| (((uint64) (x[3])) << 32) \
| (((uint64) (x[2])) << 40) \
| (((uint64) (x[1])) << 48) \
| (((uint64) (x[0])) << 56)
;
}
static void store_bigendian(unsigned char *x,uint64 u)
{
x[7] = u; u >>= 8;
x[6] = u; u >>= 8;
x[5] = u; u >>= 8;
x[4] = u; u >>= 8;
x[3] = u; u >>= 8;
x[2] = u; u >>= 8;
x[1] = u; u >>= 8;
x[0] = u;
}
#define SHR(x,c) ((x) >> (c))
#define ROTR(x,c) (((x) >> (c)) | ((x) << (64 - (c))))
#define sigma0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x,7))
#define sigma1(x) (ROTR(x,19) ^ ROTR(x,61) ^ SHR(x,6))
#define M(w0,w14,w9,w1) w0 = sigma1(w14) + w9 + sigma0(w1) + w0;
static void expand(uint64 *w)
{
M(w[0] ,w[14],w[9] ,w[1] )
M(w[1] ,w[15],w[10],w[2] )
M(w[2] ,w[0] ,w[11],w[3] )
M(w[3] ,w[1] ,w[12],w[4] )
M(w[4] ,w[2] ,w[13],w[5] )
M(w[5] ,w[3] ,w[14],w[6] )
M(w[6] ,w[4] ,w[15],w[7] )
M(w[7] ,w[5] ,w[0] ,w[8] )
M(w[8] ,w[6] ,w[1] ,w[9] )
M(w[9] ,w[7] ,w[2] ,w[10])
M(w[10],w[8] ,w[3] ,w[11])
M(w[11],w[9] ,w[4] ,w[12])
M(w[12],w[10],w[5] ,w[13])
M(w[13],w[11],w[6] ,w[14])
M(w[14],w[12],w[7] ,w[15])
M(w[15],w[13],w[8] ,w[0] )
}
#define Ch(x,y,z) (z ^ (x & (y ^ z)))
#define Maj(x,y,z) ((x & (y ^ z)) ^ (y & z))
#define Sigma0(x) (ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39))
#define Sigma1(x) (ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41))
#define F(r0,r1,r2,r3,r4,r5,r6,r7,w,k) \
r7 += Sigma1(r4) + Ch(r4,r5,r6) + k + w; \
r3 += r7; \
r7 += Sigma0(r0) + Maj(r0,r1,r2);
static void handle(uint64 *r,const uint64 *w,const uint64 *c)
{
F(r[0],r[1],r[2],r[3],r[4],r[5],r[6],r[7],w[0] ,c[0])
F(r[7],r[0],r[1],r[2],r[3],r[4],r[5],r[6],w[1] ,c[1])
F(r[6],r[7],r[0],r[1],r[2],r[3],r[4],r[5],w[2] ,c[2])
F(r[5],r[6],r[7],r[0],r[1],r[2],r[3],r[4],w[3] ,c[3])
F(r[4],r[5],r[6],r[7],r[0],r[1],r[2],r[3],w[4] ,c[4])
F(r[3],r[4],r[5],r[6],r[7],r[0],r[1],r[2],w[5] ,c[5])
F(r[2],r[3],r[4],r[5],r[6],r[7],r[0],r[1],w[6] ,c[6])
F(r[1],r[2],r[3],r[4],r[5],r[6],r[7],r[0],w[7] ,c[7])
F(r[0],r[1],r[2],r[3],r[4],r[5],r[6],r[7],w[8] ,c[8])
F(r[7],r[0],r[1],r[2],r[3],r[4],r[5],r[6],w[9] ,c[9])
F(r[6],r[7],r[0],r[1],r[2],r[3],r[4],r[5],w[10],c[10])
F(r[5],r[6],r[7],r[0],r[1],r[2],r[3],r[4],w[11],c[11])
F(r[4],r[5],r[6],r[7],r[0],r[1],r[2],r[3],w[12],c[12])
F(r[3],r[4],r[5],r[6],r[7],r[0],r[1],r[2],w[13],c[13])
F(r[2],r[3],r[4],r[5],r[6],r[7],r[0],r[1],w[14],c[14])
F(r[1],r[2],r[3],r[4],r[5],r[6],r[7],r[0],w[15],c[15])
}
static const uint64 round[80] = {
0x428a2f98d728ae22ULL
, 0x7137449123ef65cdULL
, 0xb5c0fbcfec4d3b2fULL
, 0xe9b5dba58189dbbcULL
, 0x3956c25bf348b538ULL
, 0x59f111f1b605d019ULL
, 0x923f82a4af194f9bULL
, 0xab1c5ed5da6d8118ULL
, 0xd807aa98a3030242ULL
, 0x12835b0145706fbeULL
, 0x243185be4ee4b28cULL
, 0x550c7dc3d5ffb4e2ULL
, 0x72be5d74f27b896fULL
, 0x80deb1fe3b1696b1ULL
, 0x9bdc06a725c71235ULL
, 0xc19bf174cf692694ULL
, 0xe49b69c19ef14ad2ULL
, 0xefbe4786384f25e3ULL
, 0x0fc19dc68b8cd5b5ULL
, 0x240ca1cc77ac9c65ULL
, 0x2de92c6f592b0275ULL
, 0x4a7484aa6ea6e483ULL
, 0x5cb0a9dcbd41fbd4ULL
, 0x76f988da831153b5ULL
, 0x983e5152ee66dfabULL
, 0xa831c66d2db43210ULL
, 0xb00327c898fb213fULL
, 0xbf597fc7beef0ee4ULL
, 0xc6e00bf33da88fc2ULL
, 0xd5a79147930aa725ULL
, 0x06ca6351e003826fULL
, 0x142929670a0e6e70ULL
, 0x27b70a8546d22ffcULL
, 0x2e1b21385c26c926ULL
, 0x4d2c6dfc5ac42aedULL
, 0x53380d139d95b3dfULL
, 0x650a73548baf63deULL
, 0x766a0abb3c77b2a8ULL
, 0x81c2c92e47edaee6ULL
, 0x92722c851482353bULL
, 0xa2bfe8a14cf10364ULL
, 0xa81a664bbc423001ULL
, 0xc24b8b70d0f89791ULL
, 0xc76c51a30654be30ULL
, 0xd192e819d6ef5218ULL
, 0xd69906245565a910ULL
, 0xf40e35855771202aULL
, 0x106aa07032bbd1b8ULL
, 0x19a4c116b8d2d0c8ULL
, 0x1e376c085141ab53ULL
, 0x2748774cdf8eeb99ULL
, 0x34b0bcb5e19b48a8ULL
, 0x391c0cb3c5c95a63ULL
, 0x4ed8aa4ae3418acbULL
, 0x5b9cca4f7763e373ULL
, 0x682e6ff3d6b2b8a3ULL
, 0x748f82ee5defb2fcULL
, 0x78a5636f43172f60ULL
, 0x84c87814a1f0ab72ULL
, 0x8cc702081a6439ecULL
, 0x90befffa23631e28ULL
, 0xa4506cebde82bde9ULL
, 0xbef9a3f7b2c67915ULL
, 0xc67178f2e372532bULL
, 0xca273eceea26619cULL
, 0xd186b8c721c0c207ULL
, 0xeada7dd6cde0eb1eULL
, 0xf57d4f7fee6ed178ULL
, 0x06f067aa72176fbaULL
, 0x0a637dc5a2c898a6ULL
, 0x113f9804bef90daeULL
, 0x1b710b35131c471bULL
, 0x28db77f523047d84ULL
, 0x32caab7b40c72493ULL
, 0x3c9ebe0a15c9bebcULL
, 0x431d67c49c100d4cULL
, 0x4cc5d4becb3e42b6ULL
, 0x597f299cfc657e2aULL
, 0x5fcb6fab3ad6faecULL
, 0x6c44198c4a475817ULL
};
int crypto_hashblocks(unsigned char *statebytes,const unsigned char *in,long long inlen)
{
uint64 w[16];
uint64 state[8];
uint64 r[8];
int i;
for (i = 0;i < 8;++i)
state[i] = r[i] = load_bigendian(statebytes+8*i);
while (inlen >= 128) {
for (i = 0;i < 16;++i)
w[i] = load_bigendian(in+8*i);
handle(r,w,round+0);
expand(w);
handle(r,w,round+16);
expand(w);
handle(r,w,round+32);
expand(w);
handle(r,w,round+48);
expand(w);
handle(r,w,round+64);
for (i = 0;i < 8;++i) {
uint64 x = r[i]+state[i];
state[i] = x;
r[i] = x;
}
in += 128;
inlen -= 128;
}
for (i = 0;i < 8;++i)
store_bigendian(statebytes+8*i,state[i]);
return inlen;
}
@@ -0,0 +1 @@
../m3/implementors
@@ -0,0 +1 @@
../m3/api.h
@@ -0,0 +1,198 @@
#include "crypto_hashblocks.h"
typedef unsigned long long uint64;
static uint64 load_bigendian(const unsigned char *x)
{
return
(uint64) (x[7]) \
| (((uint64) (x[6])) << 8) \
| (((uint64) (x[5])) << 16) \
| (((uint64) (x[4])) << 24) \
| (((uint64) (x[3])) << 32) \
| (((uint64) (x[2])) << 40) \
| (((uint64) (x[1])) << 48) \
| (((uint64) (x[0])) << 56)
;
}
static void store_bigendian(unsigned char *x,uint64 u)
{
x[7] = u; u >>= 8;
x[6] = u; u >>= 8;
x[5] = u; u >>= 8;
x[4] = u; u >>= 8;
x[3] = u; u >>= 8;
x[2] = u; u >>= 8;
x[1] = u; u >>= 8;
x[0] = u;
}
#define SHR(x,c) ((x) >> (c))
#define ROTR(x,c) (((x) >> (c)) | ((x) << (64 - (c))))
#define sigma0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x,7))
#define sigma1(x) (ROTR(x,19) ^ ROTR(x,61) ^ SHR(x,6))
static void expand(uint64 *w)
{
int i;
for (i = 0;i < 16;++i) {
w[i] += w[15&(i+9)];
w[i] += sigma1(w[15&(i+14)]);
w[i] += sigma0(w[15&(i+1)]);
}
}
#define Ch(x,y,z) (z ^ (x & (y ^ z)))
#define Maj(x,y,z) ((x & (y ^ z)) ^ (y & z))
#define Sigma0(x) (ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39))
#define Sigma1(x) (ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41))
static void handle(uint64 *r,const uint64 *w,const uint64 *c)
{
int i;
uint64 x;
for (i = 0;i < 16;++i) {
x = r[7&(7-i)];
x += w[i];
x += c[i];
x += Sigma1(r[7&(4-i)]);
x += Ch(r[7&(4-i)],r[7&(5-i)],r[7&(6-i)]);
r[7&(3-i)] += x;
x += Sigma0(r[7&(0-i)]);
x += Maj(r[7&(0-i)],r[7&(1-i)],r[7&(2-i)]);
r[7&(7-i)] = x;
}
}
static const uint64 round[80] = {
0x428a2f98d728ae22ULL
, 0x7137449123ef65cdULL
, 0xb5c0fbcfec4d3b2fULL
, 0xe9b5dba58189dbbcULL
, 0x3956c25bf348b538ULL
, 0x59f111f1b605d019ULL
, 0x923f82a4af194f9bULL
, 0xab1c5ed5da6d8118ULL
, 0xd807aa98a3030242ULL
, 0x12835b0145706fbeULL
, 0x243185be4ee4b28cULL
, 0x550c7dc3d5ffb4e2ULL
, 0x72be5d74f27b896fULL
, 0x80deb1fe3b1696b1ULL
, 0x9bdc06a725c71235ULL
, 0xc19bf174cf692694ULL
, 0xe49b69c19ef14ad2ULL
, 0xefbe4786384f25e3ULL
, 0x0fc19dc68b8cd5b5ULL
, 0x240ca1cc77ac9c65ULL
, 0x2de92c6f592b0275ULL
, 0x4a7484aa6ea6e483ULL
, 0x5cb0a9dcbd41fbd4ULL
, 0x76f988da831153b5ULL
, 0x983e5152ee66dfabULL
, 0xa831c66d2db43210ULL
, 0xb00327c898fb213fULL
, 0xbf597fc7beef0ee4ULL
, 0xc6e00bf33da88fc2ULL
, 0xd5a79147930aa725ULL
, 0x06ca6351e003826fULL
, 0x142929670a0e6e70ULL
, 0x27b70a8546d22ffcULL
, 0x2e1b21385c26c926ULL
, 0x4d2c6dfc5ac42aedULL
, 0x53380d139d95b3dfULL
, 0x650a73548baf63deULL
, 0x766a0abb3c77b2a8ULL
, 0x81c2c92e47edaee6ULL
, 0x92722c851482353bULL
, 0xa2bfe8a14cf10364ULL
, 0xa81a664bbc423001ULL
, 0xc24b8b70d0f89791ULL
, 0xc76c51a30654be30ULL
, 0xd192e819d6ef5218ULL
, 0xd69906245565a910ULL
, 0xf40e35855771202aULL
, 0x106aa07032bbd1b8ULL
, 0x19a4c116b8d2d0c8ULL
, 0x1e376c085141ab53ULL
, 0x2748774cdf8eeb99ULL
, 0x34b0bcb5e19b48a8ULL
, 0x391c0cb3c5c95a63ULL
, 0x4ed8aa4ae3418acbULL
, 0x5b9cca4f7763e373ULL
, 0x682e6ff3d6b2b8a3ULL
, 0x748f82ee5defb2fcULL
, 0x78a5636f43172f60ULL
, 0x84c87814a1f0ab72ULL
, 0x8cc702081a6439ecULL
, 0x90befffa23631e28ULL
, 0xa4506cebde82bde9ULL
, 0xbef9a3f7b2c67915ULL
, 0xc67178f2e372532bULL
, 0xca273eceea26619cULL
, 0xd186b8c721c0c207ULL
, 0xeada7dd6cde0eb1eULL
, 0xf57d4f7fee6ed178ULL
, 0x06f067aa72176fbaULL
, 0x0a637dc5a2c898a6ULL
, 0x113f9804bef90daeULL
, 0x1b710b35131c471bULL
, 0x28db77f523047d84ULL
, 0x32caab7b40c72493ULL
, 0x3c9ebe0a15c9bebcULL
, 0x431d67c49c100d4cULL
, 0x4cc5d4becb3e42b6ULL
, 0x597f299cfc657e2aULL
, 0x5fcb6fab3ad6faecULL
, 0x6c44198c4a475817ULL
};
int crypto_hashblocks(unsigned char *statebytes,const unsigned char *in,long long inlen)
{
uint64 w[16];
uint64 state[8];
uint64 r[8];
int i;
for (i = 0;i < 8;++i)
state[i] = r[i] = load_bigendian(statebytes+8*i);
while (inlen >= 128) {
for (i = 0;i < 16;++i)
w[i] = load_bigendian(in+8*i);
handle(r,w,round+0);
expand(w);
handle(r,w,round+16);
expand(w);
handle(r,w,round+32);
expand(w);
handle(r,w,round+48);
expand(w);
handle(r,w,round+64);
for (i = 0;i < 8;++i) {
uint64 x = r[i]+state[i];
state[i] = x;
r[i] = x;
}
in += 128;
inlen -= 128;
}
for (i = 0;i < 8;++i)
store_bigendian(statebytes+8*i,state[i]);
return inlen;
}
@@ -0,0 +1 @@
../m3/implementors
@@ -0,0 +1 @@
../m3/api.h
@@ -0,0 +1,170 @@
#include "crypto_hashblocks.h"
typedef unsigned long long uint64;
static uint64 load_bigendian(const unsigned char *x)
{
return
(uint64) (x[7]) \
| (((uint64) (x[6])) << 8) \
| (((uint64) (x[5])) << 16) \
| (((uint64) (x[4])) << 24) \
| (((uint64) (x[3])) << 32) \
| (((uint64) (x[2])) << 40) \
| (((uint64) (x[1])) << 48) \
| (((uint64) (x[0])) << 56)
;
}
static void store_bigendian(unsigned char *x,uint64 u)
{
x[7] = u; u >>= 8;
x[6] = u; u >>= 8;
x[5] = u; u >>= 8;
x[4] = u; u >>= 8;
x[3] = u; u >>= 8;
x[2] = u; u >>= 8;
x[1] = u; u >>= 8;
x[0] = u;
}
#define SHR(x,c) ((x) >> (c))
#define ROTR(x,c) (((x) >> (c)) | ((x) << (64 - (c))))
#define sigma0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x,7))
#define sigma1(x) (ROTR(x,19) ^ ROTR(x,61) ^ SHR(x,6))
#define Ch(x,y,z) (z ^ (x & (y ^ z)))
#define Maj(x,y,z) ((x & (y ^ z)) ^ (y & z))
#define Sigma0(x) (ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39))
#define Sigma1(x) (ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41))
static const uint64 round[80] = {
0x428a2f98d728ae22ULL
, 0x7137449123ef65cdULL
, 0xb5c0fbcfec4d3b2fULL
, 0xe9b5dba58189dbbcULL
, 0x3956c25bf348b538ULL
, 0x59f111f1b605d019ULL
, 0x923f82a4af194f9bULL
, 0xab1c5ed5da6d8118ULL
, 0xd807aa98a3030242ULL
, 0x12835b0145706fbeULL
, 0x243185be4ee4b28cULL
, 0x550c7dc3d5ffb4e2ULL
, 0x72be5d74f27b896fULL
, 0x80deb1fe3b1696b1ULL
, 0x9bdc06a725c71235ULL
, 0xc19bf174cf692694ULL
, 0xe49b69c19ef14ad2ULL
, 0xefbe4786384f25e3ULL
, 0x0fc19dc68b8cd5b5ULL
, 0x240ca1cc77ac9c65ULL
, 0x2de92c6f592b0275ULL
, 0x4a7484aa6ea6e483ULL
, 0x5cb0a9dcbd41fbd4ULL
, 0x76f988da831153b5ULL
, 0x983e5152ee66dfabULL
, 0xa831c66d2db43210ULL
, 0xb00327c898fb213fULL
, 0xbf597fc7beef0ee4ULL
, 0xc6e00bf33da88fc2ULL
, 0xd5a79147930aa725ULL
, 0x06ca6351e003826fULL
, 0x142929670a0e6e70ULL
, 0x27b70a8546d22ffcULL
, 0x2e1b21385c26c926ULL
, 0x4d2c6dfc5ac42aedULL
, 0x53380d139d95b3dfULL
, 0x650a73548baf63deULL
, 0x766a0abb3c77b2a8ULL
, 0x81c2c92e47edaee6ULL
, 0x92722c851482353bULL
, 0xa2bfe8a14cf10364ULL
, 0xa81a664bbc423001ULL
, 0xc24b8b70d0f89791ULL
, 0xc76c51a30654be30ULL
, 0xd192e819d6ef5218ULL
, 0xd69906245565a910ULL
, 0xf40e35855771202aULL
, 0x106aa07032bbd1b8ULL
, 0x19a4c116b8d2d0c8ULL
, 0x1e376c085141ab53ULL
, 0x2748774cdf8eeb99ULL
, 0x34b0bcb5e19b48a8ULL
, 0x391c0cb3c5c95a63ULL
, 0x4ed8aa4ae3418acbULL
, 0x5b9cca4f7763e373ULL
, 0x682e6ff3d6b2b8a3ULL
, 0x748f82ee5defb2fcULL
, 0x78a5636f43172f60ULL
, 0x84c87814a1f0ab72ULL
, 0x8cc702081a6439ecULL
, 0x90befffa23631e28ULL
, 0xa4506cebde82bde9ULL
, 0xbef9a3f7b2c67915ULL
, 0xc67178f2e372532bULL
, 0xca273eceea26619cULL
, 0xd186b8c721c0c207ULL
, 0xeada7dd6cde0eb1eULL
, 0xf57d4f7fee6ed178ULL
, 0x06f067aa72176fbaULL
, 0x0a637dc5a2c898a6ULL
, 0x113f9804bef90daeULL
, 0x1b710b35131c471bULL
, 0x28db77f523047d84ULL
, 0x32caab7b40c72493ULL
, 0x3c9ebe0a15c9bebcULL
, 0x431d67c49c100d4cULL
, 0x4cc5d4becb3e42b6ULL
, 0x597f299cfc657e2aULL
, 0x5fcb6fab3ad6faecULL
, 0x6c44198c4a475817ULL
};
int crypto_hashblocks(unsigned char *statebytes,const unsigned char *in,long long inlen)
{
uint64 w[16];
uint64 state[8];
uint64 r[8];
uint64 x;
int i;
for (i = 0;i < 8;++i)
state[i] = r[i] = load_bigendian(statebytes+8*i);
while (inlen >= 128) {
for (i = 0;i < 16;++i)
w[i] = load_bigendian(in+8*i);
for (i = 0;i < 80;++i) {
x = r[7&(7-i)];
x += w[15&i];
x += round[i];
x += Sigma1(r[7&(4-i)]);
x += Ch(r[7&(4-i)],r[7&(5-i)],r[7&(6-i)]);
r[7&(3-i)] += x;
x += Sigma0(r[7&(0-i)]);
x += Maj(r[7&(0-i)],r[7&(1-i)],r[7&(2-i)]);
r[7&(7-i)] = x;
/* not used for i >= 64: */
w[15&i] += w[15&(i+9)];
w[15&i] += sigma1(w[15&(i+14)]);
w[15&i] += sigma0(w[15&(i+1)]);
}
for (i = 0;i < 8;++i) {
uint64 x = r[i]+state[i];
state[i] = x;
r[i] = x;
}
in += 128;
inlen -= 128;
}
for (i = 0;i < 8;++i)
store_bigendian(statebytes+8*i,state[i]);
return inlen;
}

Some files were not shown because too many files have changed in this diff Show More