DJB lib25519 Rust wrapper

2023-06-20 17:19:09 +02:00
1695 changed files with 337040 additions and 1 deletions
@@ -102,7 +102,7 @@ default-members = [
    "explorer-api",
 ]

-exclude = ["explorer", "contracts", "clients/webassembly", "nym-wallet", "nym-connect/mobile/src-tauri", "nym-connect/desktop", "cpu-cycles"]
+exclude = ["explorer", "contracts", "clients/webassembly", "nym-wallet", "nym-connect/mobile/src-tauri", "nym-connect/desktop", "cpu-cycles", "lib-25519"]

 [workspace.package]
 authors = ["Nym Technologies SA"]
@@ -0,0 +1,23 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "lib_25519"
+version = "0.1.0"
+dependencies = [
+ "cfg-if",
+ "libc",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.146"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b"
@@ -0,0 +1,12 @@
+[package]
+name = "lib_25519"
+version = "0.1.0"
+edition = "2021"
+build = "build.rs"
+links = "25519"
+
+[dependencies]
+libc = "0.2.140"
+
+[build-dependencies]
+cfg-if = "1"
@@ -0,0 +1,69 @@
+use std::{env, path::PathBuf, process::Command};
+
+fn main() {
+    let out_dir = env::var("OUT_DIR").unwrap();
+    let out_path = PathBuf::from(&out_dir);
+    let source_path = PathBuf::from("lib25519")
+        .canonicalize()
+        .expect("cannot canonicalize path");
+
+    cfg_if::cfg_if! {
+        if #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "mips", target_arch = "powerpc", target_arch = "powerpc64", target_arch = "arm")))] {
+            panic!("Unsupported architecture - {}!", env::var("CARGO_CFG_TARGET_ARCH").unwrap(), )
+        }
+    };
+
+    let mut compile_o_command = Command::new("./configure");
+    let compile_o_command = compile_o_command
+        .current_dir(&source_path)
+        .arg(format!("--prefix={out_dir}"));
+
+    match compile_o_command.output() {
+        Ok(output) => {
+            if !output.status.success() {
+                panic!("{:?}", unsafe {
+                    std::str::from_utf8_unchecked(&output.stderr)
+                })
+            }
+        }
+        Err(e) => panic!("{e}"),
+    }
+
+    let mut compile_o_command = Command::new("make");
+    let compile_o_command = compile_o_command
+        .current_dir(&source_path)
+        .arg("-j8")
+        .arg("install");
+
+    match compile_o_command.output() {
+        Ok(output) => {
+            if !output.status.success() {
+                panic!("{:?}", unsafe {
+                    std::str::from_utf8_unchecked(&output.stderr)
+                })
+            }
+        }
+        Err(e) => panic!("{e}"),
+    }
+
+    println!(
+        "cargo:rustc-link-search=native={}",
+        out_path.join("lib").to_str().unwrap()
+    );
+    println!("cargo:rustc-link-lib=static=25519");
+    println!("cargo:rustc-link-lib=static=randombytes_kernel");
+
+    let mut compile_o_command = Command::new("make");
+    let compile_o_command = compile_o_command.current_dir(source_path).arg("clean");
+
+    match compile_o_command.output() {
+        Ok(output) => {
+            if !output.status.success() {
+                panic!("{:?}", unsafe {
+                    std::str::from_utf8_unchecked(&output.stderr)
+                })
+            }
+        }
+        Err(e) => panic!("{e}"),
+    }
+}
@@ -0,0 +1,9 @@
+Files: *
+Copyright: 2022 Kaushik Nath
+Copyright: 2011-2022 Daniel J. Bernstein
+Copyright: 2015 Tung Chou
+Copyright: 2011-2014 Peter Schwabe
+Copyright: 2011 Niels Duif
+Copyright: 2011 Tanja Lange
+Copyright: 2011 Bo-Yin Yang
+License: CC0-1.0
@@ -0,0 +1,297 @@
+lib25519 draws on many previous implementations listed below, plus new
+speedups from Kaushik Nath and new infrastructure work and factoring
+from Daniel J. Bernstein. All software is in the public domain. Since
+some organizations require licenses, lib25519 is also CC0-licensed.
+
+Some code was originally copied from public-domain code in the SUPERCOP
+benchmarking framework. See https://bench.cr.yp.to/supercop.html for
+SUPERCOP releases. The following small changes from code available in
+SUPERCOP are made in lib25519 without further comment:
+
+   * Returning void rather than int for functions that never fail in
+     lib25519.
+   * Message lengths long long rather than unsigned long long.
+   * Defining various constants in .c files (to simplify PIC handling)
+     instead of .S files.
+   * Moving some C files to shared-*.c (which in lib25519 means that
+     these files are compiled by only one compiler).
+   * Using CRYPTO_SHARED_NAMESPACE rather than CRYPTO_NAMESPACE for
+     symbols defined in *.S and shared-*.c.
+   * Adding various "linker define" and "linker use" lines.
+
+Larger changes from code in SUPERCOP, such as new code divisions across
+lib25519 primitives, are indicated below.
+
+Sources of Curve25519 software (this is not a comprehensive list, just
+the software that lib25519 is derived from):
+
+   * Daniel J. Bernstein. "Curve25519: new Diffie-Hellman speed
+     records." Pages 207–228 in Public key cryptography—PKC 2006, 9th
+     international conference on theory and practice in public-key
+     cryptography, New York, NY, USA, April 24–26, 2006, proceedings,
+     edited by Moti Yung, Yevgeniy Dodis, Aggelos Kiayias, Tal Malkin,
+     Lecture Notes in Computer Science 3958, Springer, 2006, ISBN
+     3-540-33851-9.
+
+     This is the source of the Curve25519 design, the X25519 design, and
+     various speedups. Most of the software from that paper is specific
+     to a variety of 32-bit platforms (radix 2^25.5 or radix 2^21.25),
+     but the portable supercop/crypto_scalarmult/curve25519/ref10 (radix
+     2^25.5) is derived from this.
+
+     lib25519/crypto_nP/montgomery25519/ref10 starts with
+     supercop/crypto_scalarmult/curve25519/ref10, and tweaks the API to
+     provide crypto_nP instead of crypto_scalarmult. Inversion is
+     factored out, producing crypto_pow/inv25519/ref10. The trivial
+     crypto_scalarmult_base wrapper is factored out, producing
+     crypto_nG/montgomery25519/ref/base.c; lib25519 has faster nG
+     functions, but intentionally provides ref for situations where
+     speed is outweighed by simplicity, assurance, code size, etc.
+
+   * supercop/crypto_scalarmult/curve25519/donna_c64 (radix 2^51) from
+     Adam Langley.
+
+     lib25519/crypto_nP/montgomery25519/donna_c64 starts from this and
+     tweaks the API to provide crypto_nP instead of crypto_scalarmult
+     (and removes crypto_scalarmult_base). crypto_pow/inv25519/donna_c64
+     is factored out.
+
+   * Daniel J. Bernstein, Niels Duif, Tanja Lange, Peter Schwabe, Bo-Yin
+     Yang. "High-speed high-security signatures." Pages 124–142 in
+     Cryptographic hardware and embedded systems—CHES 2011, 13th
+     international workshop, Nara, Japan, September 28–October 1, 2011,
+     proceedings, edited by Bart Preneel, Tsuyoshi Takagi, Lecture Notes
+     in Computer Science 6917, Springer, 2011, ISBN 978-3-642-23950-2.
+     Journal version: Journal of Cryptographic Engineering 2 (2012),
+     77–89. 
+
+     This is the source of the Ed25519 design and various X25519/Ed25519
+     speedups for 64-bit Intel/AMD platforms, in particular producing
+     supercop/crypto_{scalarmult/curve,sign/ed}25519/amd64-{51,64}*
+     (radix 2^51 and radix 2^64 respectively). Peter Schwabe led the
+     implementation work.
+
+     lib25519/crypto_nP/montgomery25519/amd64-51 starts from
+     supercop/crypto_scalarmult/curve25519/amd64-51 and tweaks the API
+     to provide crypto_nP instead of crypto_scalarmult (and removes
+     crypto_scalarmult_base). crypto_nG/merged25519/amd64-51 (for
+     fixed-base-point multiplication), crypto_mGnP/ed25519/amd64-51 (for
+     double-scalar multiplication), and crypto_sign/ed25519/amd64 (for
+     the remaining signing components) factor
+     supercop/crypto_sign/ed25519/amd64-51 into smaller pieces.
+     crypto_pow/inv25519/amd64-51 is also factored out. "SMALLTABLES"
+     support is removed. Support for batch verification is removed,
+     although it could reappear in a subsequent lib25519 release.
+
+     Similar comments apply to amd64-64 and ref10. A compiler warning
+     is eliminated (window size 64 in amd64-64-24k/sc25519.h).
+
+   * Tung Chou. "Sandy2x: New Curve25519 Speed Records." SAC 2015.
+
+     This is the source of various X25519 speedups using 256-bit vector
+     instructions, specifically AVX vector instructions in Intel's Sandy
+     Bridge, in particular producing
+     supercop/crypto_scalarmult/curve25519/sandy2x (radix 2^25.5).
+
+     lib25519/crypto_{nP,nG}/montgomery25519/sandy2x start from
+     supercop/crypto_scalarmult/curve25519/sandy2x, and tweak the API to
+     provide crypto_nP and crypto_nG instead of crypto_scalarmult and
+     crypto_scalarmult_base respectively. The top bit of the incoming
+     point is cleared. crypto_pow/inv25519/sandy2x is factored out.
+
+   * Kaushik Nath and Palash Sarkar, "Efficient arithmetic in
+     (pseudo-)Mersenne prime order fields", Advances in Mathematics of
+     Communications 16 (2022), pages 303–348.
+
+     Original release:
+     https://github.com/kn-cs/pmp-farith/tree/master/gf-p2-255-19/SL
+     https://github.com/kn-cs/pmp-farith/tree/master/gf-p2-255-19/USL1
+
+     The "SL" software is the source of various speedups to the amd64-64
+     software, producing the "maa4" versions of fe25519_mul.S,
+     fe25519_square.S, and fe25519_nsquare.S. These .S files are used
+     inside the following lib25519 directories:
+     crypto_mGnP/ed25519/amd64-avx2-10l-maa4
+     crypto_mGnP/ed25519/amd64-avx2-9l-maa4
+     crypto_mGnP/ed25519/amd64-maa4
+     crypto_nG/merged25519/amd64-avx2-10l-maa4
+     crypto_nG/merged25519/amd64-avx2-9l-maa4
+     crypto_nG/merged25519/amd64-maa4
+     crypto_nP/montgomery25519/amd64-avx2-hey10l-maa4
+     crypto_nP/montgomery25519/amd64-avx2-hey9l-maa4
+     crypto_nP/montgomery25519/amd64-avx2-ns10l-maa4
+     crypto_nP/montgomery25519/amd64-avx2-ns9l-maa4
+     crypto_nP/montgomery25519/amd64-maa4
+     crypto_pow/inv25519/amd64-maa4
+
+     The "USL" software is the source of various speedups to the
+     amd64-51 software, producing the "maa5" versions of fe25519_mul.S
+     and fe25519_nsquare.S. These .S files are used inside the following
+     lib25519 directories:
+     crypto_nP/montgomery25519/amd64-avx2-hey10l-maa5
+     crypto_nP/montgomery25519/amd64-avx2-hey9l-maa5
+     crypto_nP/montgomery25519/amd64-avx2-ns10l-maa5
+     crypto_nP/montgomery25519/amd64-avx2-ns9l-maa5
+     crypto_pow/inv25519/amd64-maa5
+
+   * Kaushik Nath and Palash Sarkar, "Security and efficiency trade-offs
+     for elliptic curve Diffie-Hellman at the 128-bit and 224-bit
+     security levels." J. Cryptogr. Eng. 12(1): 107-121 (2022).
+
+     Original release:
+     https://github.com/kn-cs/x25519/tree/master/intel64-mxaa-4limb
+     https://github.com/kn-cs/x25519
+
+     This "mxaa-4limb" software is the source of various speedups to
+     "maa4" on CPUs supporting BMI2 instructions (e.g., Intel Haswell
+     from 2013), in particular producing the "mxaa" versions of
+     fe25519_mul.S and fe25519_nsquare.S. These .S files are used inside
+     the following lib25519 directories:
+     crypto_mGnP/ed25519/amd64-avx2-10l-mxaa
+     crypto_mGnP/ed25519/amd64-avx2-9l-mxaa
+     crypto_mGnP/ed25519/amd64-mxaa
+     crypto_nG/merged25519/amd64-avx2-10l-mxaa
+     crypto_nG/merged25519/amd64-avx2-9l-mxaa
+     crypto_nG/merged25519/amd64-mxaa
+     crypto_nP/montgomery25519/amd64-avx2-hey10l-mxaa
+     crypto_nP/montgomery25519/amd64-avx2-hey9l-mxaa
+     crypto_nP/montgomery25519/amd64-avx2-ns10l-mxaa
+     crypto_nP/montgomery25519/amd64-avx2-ns9l-mxaa
+     crypto_nP/montgomery25519/amd64-mxaa
+     crypto_pow/inv25519/amd64-mxaa
+
+     This software is also the source of the following three different
+     Montgomery-ladder functions, where the third also builds on the
+     "maax" work listed below:
+     crypto_nP/montgomery25519/amd64-maa4/mladder.S
+     crypto_nP/montgomery25519/amd64-mxaa/mladder.S
+     crypto_nP/montgomery25519/amd64-maax/mladder.S
+
+   * Kaushik Nath and Palash Sarkar, "Efficient arithmetic in
+     (pseudo-)Mersenne prime order fields", Advances in Mathematics of
+     Communications 16 (2022), pages 303–348. Original release:
+     https://github.com/kn-cs/pmp-farith/tree/master/gf-p2-255-19/SLDCC
+
+     This is the source of various speedups to "mxaa" on CPUs that also
+     support ADX instructions (e.g., Intel Broadwell from 2014), in
+     particular producing the "maax" versions of fe25519_mul.S,
+     fe25519_square.S, and fe25519_nsquare.S. These .S files are used
+     inside the following lib25519 directories:
+     crypto_mGnP/ed25519/amd64-avx2-10l-maax
+     crypto_mGnP/ed25519/amd64-avx2-9l-maax
+     crypto_mGnP/ed25519/amd64-avx512ifma-5l-maax
+     crypto_mGnP/ed25519/amd64-maax
+     crypto_nG/merged25519/amd64-avx2-10l-maax
+     crypto_nG/merged25519/amd64-avx2-9l-maax
+     crypto_nG/merged25519/amd64-avx512ifma-5l-maax
+     crypto_nG/merged25519/amd64-maax
+     crypto_nP/montgomery25519/amd64-avx2-hey10l-maax
+     crypto_nP/montgomery25519/amd64-avx2-hey9l-maax
+     crypto_nP/montgomery25519/amd64-avx2-ns10l-maax
+     crypto_nP/montgomery25519/amd64-avx2-ns9l-maax
+     crypto_nP/montgomery25519/amd64-avx512-hey10l-maax
+     crypto_nP/montgomery25519/amd64-avx512-hey9l-maax
+     crypto_nP/montgomery25519/amd64-avx512-ns10l-maax
+     crypto_nP/montgomery25519/amd64-avx512-ns9l-maax     
+     crypto_nP/montgomery25519/amd64-avx512ifma-hey5l-maax
+     crypto_nP/montgomery25519/amd64-avx512ifma-ns5l-maax
+     crypto_nP/montgomery25519/amd64-maax
+     crypto_pow/inv25519/amd64-maax
+
+   * Kaushik Nath and Palash Sarkar, "Efficient 4-Way Vectorizations of
+     the Montgomery Ladder". IEEE Trans. Computers 71(3): 712-723
+     (2022). Original release:
+     https://github.com/kn-cs/vec-ladder/tree/master/Curve25519
+
+     This is the source of the "hey10l" (radix 2^25.5), "hey9l" (radix
+     2^29), "ns10l" (radix 2^25.5), and "ns9l" (radix 2^29) versions of
+     mladder.S for CPUs that also support 256-bit AVX2 instructions
+     (e.g., Intel Haswell from 2013). In lib25519, these four .S files
+     are used in 16 directories:
+     crypto_nP/montgomery25519/amd64-avx2-hey10l-{maa4,maa5,maax,mxaa}
+     crypto_nP/montgomery25519/amd64-avx2-hey9l-{maa4,maa5,maax,mxaa}
+     crypto_nP/montgomery25519/amd64-avx2-ns10l-{maa4,maa5,maax,mxaa}
+     crypto_nP/montgomery25519/amd64-avx2-ns9l-{maa4,maa5,maax,mxaa}
+
+   * Kaushik Nath, new Montgomery-ladder code new in lib25519 (no paper
+     yet) for CPUs supporting AVX-512 instructions (e.g., Intel
+     Skylake-X from 2017). These are six files in lib25519:
+     crypto_nP/montgomery25519/amd64-avx512-hey10l-maax
+     crypto_nP/montgomery25519/amd64-avx512-hey9l-maax
+     crypto_nP/montgomery25519/amd64-avx512-ns10l-maax
+     crypto_nP/montgomery25519/amd64-avx512-ns9l-maax
+     crypto_nP/montgomery25519/amd64-avx512ifma-hey5l-maax
+     crypto_nP/montgomery25519/amd64-avx512ifma-ns5l-maax
+
+   * Kaushik Nath, nine versions of fixed-base-point
+     scalar-multiplication code new in lib25519 (no paper yet) for
+     various platforms:
+     crypto_nG/merged25519/amd64-avx2-10l-maa4/ge25519_base.S
+     crypto_nG/merged25519/amd64-avx2-10l-maax/ge25519_base.S
+     crypto_nG/merged25519/amd64-avx2-10l-mxaa/ge25519_base.S
+     crypto_nG/merged25519/amd64-avx2-9l-maa4/ge25519_base.S
+     crypto_nG/merged25519/amd64-avx2-9l-maax/ge25519_base.S
+     crypto_nG/merged25519/amd64-avx2-9l-mxaa/ge25519_base.S
+     crypto_nG/merged25519/amd64-avx512ifma-5l-maax/ge25519_base.S
+     crypto_nG/merged25519/amd64-maa4/ge25519_base.S
+     crypto_nG/merged25519/amd64-maax/ge25519_base.S
+     crypto_nG/merged25519/amd64-mxaa/ge25519_base.S
+
+   * Kaushik Nath, ten versions of double-scalar-multiplication code new
+     in lib25519 (no paper yet) for various platforms. Each version has
+     precompute.S and process.S:
+
+     crypto_mGnP/ed25519/amd64-avx2-10l-maa4/ge25519_double_scalarmult_precompute.S
+     crypto_mGnP/ed25519/amd64-avx2-10l-maax/ge25519_double_scalarmult_precompute.S
+     crypto_mGnP/ed25519/amd64-avx2-10l-mxaa/ge25519_double_scalarmult_precompute.S
+     crypto_mGnP/ed25519/amd64-avx2-9l-maa4/ge25519_double_scalarmult_precompute.S
+     crypto_mGnP/ed25519/amd64-avx2-9l-maax/ge25519_double_scalarmult_precompute.S
+     crypto_mGnP/ed25519/amd64-avx2-9l-mxaa/ge25519_double_scalarmult_precompute.S
+     crypto_mGnP/ed25519/amd64-avx512ifma-5l-maax/ge25519_double_scalarmult_precompute.S
+     crypto_mGnP/ed25519/amd64-maa4/ge25519_double_scalarmult_precompute.S
+     crypto_mGnP/ed25519/amd64-maax/ge25519_double_scalarmult_precompute.S
+     crypto_mGnP/ed25519/amd64-mxaa/ge25519_double_scalarmult_precompute.S
+
+     crypto_mGnP/ed25519/amd64-avx2-10l-maa4/ge25519_double_scalarmult_process.S
+     crypto_mGnP/ed25519/amd64-avx2-10l-maax/ge25519_double_scalarmult_process.S
+     crypto_mGnP/ed25519/amd64-avx2-10l-mxaa/ge25519_double_scalarmult_process.S
+     crypto_mGnP/ed25519/amd64-avx2-9l-maa4/ge25519_double_scalarmult_process.S
+     crypto_mGnP/ed25519/amd64-avx2-9l-maax/ge25519_double_scalarmult_process.S
+     crypto_mGnP/ed25519/amd64-avx2-9l-mxaa/ge25519_double_scalarmult_process.S
+     crypto_mGnP/ed25519/amd64-avx512ifma-5l-maax/ge25519_double_scalarmult_process.S
+     crypto_mGnP/ed25519/amd64-maa4/ge25519_double_scalarmult_process.S
+     crypto_mGnP/ed25519/amd64-maax/ge25519_double_scalarmult_process.S
+     crypto_mGnP/ed25519/amd64-mxaa/ge25519_double_scalarmult_process.S
+
+Almost all of the crypto_pow/inv25519 implementations use exponentiation,
+but there is also a different implementation from the following source:
+
+   * Daniel J. Bernstein, Bo-Yin Yang. "Fast constant-time gcd
+     computation and modular inversion." IACR Transactions on
+     Cryptographic Hardware and Embedded Systems 2019 issue 3 (2019),
+     340–398.
+
+     This is the source of the "safegcd" algorithm and software. Further
+     speedups (no paper yet; ideas from Peter Dettman, Gregory Maxwell,
+     and Pieter Wuille) have produced the "inverse25519skylake" software
+     available here: https://gcd.cr.yp.to/software.html
+
+     lib25519/crypto_pow/inv25519/amd64-safegcd is inverse25519skylake,
+     tweaked to provide the crypto_pow API and to clear the top bit of
+     the input.
+
+For lower-layer SHA-512 functions:
+
+   * Daniel J. Bernstein, supercop/crypto_hash*/sha512/*. In lib25519,
+     some unused variables are removed in crypto_hashblocks/sha512/avx
+     to eliminate compiler warnings.
+
+Most of the lib25519 infrastructure, such as the run-time implementation
+selector automatically guided by CPU type and previous benchmarks, is
+new in lib25519 from Daniel J. Bernstein. Portions of autogen-speed
+(generating lib25519-speed.c) and autogen-test (generating
+lib25519-test.c) are based on benchmarking software and test software in
+SUPERCOP by Daniel J. Bernstein. The symmetric-cryptography code for
+generating pseudorandom test inputs and hashing test outputs is adapted
+from TweetNaCl, a library by Daniel J. Bernstein, Wesley Janssen, Tanja
+Lange, and Peter Schwabe.
@@ -0,0 +1,53 @@
+Prerequisites: python3; gcc and/or clang.
+
+For sysadmins, to install in /usr/local/{include,lib,bin}:
+
+    ./configure && make -j8 install
+
+For users, to install in $HOME/{include,lib,bin}:
+
+    ./configure --prefix=$HOME && make -j8 install
+
+For distributors creating a package: Run
+
+    ./configure --prefix=/usr && make -j8
+
+and then follow your usual packaging procedures for the .h files in
+build/0/package/include, the libraries in build/0/package/lib, and the
+test programs in build/0/package/bin.
+
+The long-term plan is to split off some components of lib25519 into
+their own packages, and distributors may already wish to package these
+components accordingly:
+
+   * -lcpucycles will be its own package.
+
+   * -lrandombytes_kernel will be its own package.
+
+   * -lrandombytes will be an indirection layer providing randombytes.h
+     and supporting an ecosystem of randombytes() implementations (via,
+     e.g., Debian's /etc/alternatives), such as -lrandombytes_kernel,
+     -lrandombytes_per_process_rng, etc.
+
+   * -l25519 will be the main lib25519 package, using -lrandombytes and
+     -lcpucycles.
+
+lib25519-test already uses -l25519 without -lrandombytes: it substitutes
+its own knownrandombytes() to generate test vectors.
+
+More options: You can run
+
+    ./configure --host=amd64
+
+to override ./configure's guess of the architecture that it should
+compile for. The architecture controls which implementations to try
+(see crypto_*/*/*/architectures) and which compilers to try (see
+compilers/*).
+
+Inside the build directory, 0 is symlinked to amd64 for --host=amd64.
+Running "make clean" removes build/amd64. Re-running ./configure
+automatically starts with "make clean".
+
+A subsequent ./configure --host=arm will create build/arm and symlink
+0 -> arm, without touching an existing build/amd64. However,
+cross-compilers aren't yet selected automatically.
@@ -0,0 +1,405 @@
+This file explains the internal structure of lib25519, and explains how
+to add new instruction sets and new implementations.
+
+## Primitives
+
+The directories `crypto_*/*` inside lib25519 define the following
+primitives (see also `autogen-test` for Python versions of the
+mathematical primitives):
+
+* `crypto_verify/32`: `crypto_verify_32(s,t)` returns 0 when the 32-byte
+  arrays `s` and `t` are equal, otherwise `-1`. This function takes
+  constant time.
+
+* `crypto_hashblocks/sha512`: `crypto_hashblocks_sha512(h,x,xlen)`
+  updates an intermediate SHA-512 hash `h` using all of the full
+  128-byte blocks at the beginning of the `xlen`-byte array `x`, and
+  returns the number of bytes left over, namely `xlen` mod 128. This
+  function takes time that depends on `xlen` but not on the contents of
+  `h` or `x`.
+
+* `crypto_hash/sha512`: `crypto_hash_sha512(h,x,xlen)` computes the
+  SHA-512 hash `h` of the `xlen`-byte array `x`. This function takes
+  time that depends on `xlen` but not on the contents of `x`.
+
+* `crypto_pow/inv25519`: `crypto_pow_inv25519(y,x)` computes the
+  2^255^−21 power `y` of an integer `x` modulo 2^255^−19. This is the
+  same as the inverse of `x` modulo 2^255^−19 if `x` is not divisible by
+  2^255^−19. The integers `x` and `y` are represented as a 32-byte array
+  in little-endian form. This function takes constant time.
+
+  This function guarantees that the output `y` is frozen modulo
+  2^255^−19, i.e., completely reduced to the range 0,1,...,2^255^−20. The
+  caller is expected to freeze `x` before calling this function. The
+  function accepts `x` in the range {0,1,...,2^256^−1} while ignoring the
+  top bit (the coefficient of 2^255^ in binary): i.e., the function
+  reduces `x` modulo 2^255^ and then modulo 2^255^−19.
+
+* `crypto_nP/montgomery25519`: `crypto_nP_montgomery25519(nP,n,P)`
+  computes the X25519 function: in short, if a Curve25519 point has
+  x-coordinate `P` then the `n`th multiple of the point has x-coordinate
+  `nP`. The inputs and outputs are represented as 32-byte arrays in
+  little-endian form. This function takes constant time.
+
+  X25519 is defined for `n` in the range 2^254^ + 8{0,1,2,3,...,2^251^−1}.
+  `crypto_nP_montgomery25519` allows `n` in the wider range
+  {0,1,...,2^256^−1}, and in all cases computes `m`th multiples where `m`
+  is defined as follows: make a copy of `n`, clear the top bit, set the
+  next bit, and clear the bottom three bits.
+
+  X25519 guarantees that the output `nP` is frozen. It does not require
+  the input to be frozen; also, it allows the input to be on the twist,
+  and to have small order.
+
+  `crypto_nP_montgomery25519` clears the top bit of `P` before applying
+  the X25519 function. Callers that want the X25519 function on `P` with
+  the top bit set have to reduce modulo 2^255^−19 for themselves.
+
+* `crypto_nG/merged25519`: `crypto_nG_merged25519(nG,n)` reads an
+  integer `n` in the range {0,1,...,2^256^−1} and outputs a frozen
+  integer `nG` modulo 2^255^−19, possibly with the top bit set (i.e.,
+  adding 2^255^) as described below. Both `n` and `nG` are represented
+  as 32-byte arrays in little-endian form. This function takes constant
+  time.
+
+  If the top bit of `n` is clear then `nG` is the Edwards y-coordinate
+  of the `n`th multiple of G, and the top bit is set exactly when the
+  Edwards x-coordinate is odd. Otherwise `nG` is the Montgomery
+  x-coordinate of the (`n`−2^255^)th multiple of G, and the top bit is
+  clear. Here G is the standard Curve25519 base point, which has
+  Montgomery x-coordinate 9, Edwards y-coordinate 4/5, and even Edwards
+  x-coordinate.
+
+* `crypto_nG/montgomery25519`: `crypto_nG_montgomery25519(nG,n)` is
+  the same as `crypto_nP_montgomery(nG,n,G)` where `G` is the array
+  {9,0,0,...,0}. This function takes constant time.
+
+  The point of `crypto_nG` is to save time (using a small table
+  precomputed from `G`) compared to the more general `crypto_nP`. This
+  has the disadvantage of being more complicated, which is particularly
+  important given that lib25519 has not yet been verified, and in any
+  case increases code size noticeably for X25519. There is a `ref`
+  implementation of `crypto_nG` that simply calls `crypto_nP`, and
+  setting sticky bits on the other implementation directories
+  (`chmod +t crypto_nG/montgomery25519/*; chmod -t crypto_nG/montgomery25519/ref`)
+  will force lib25519 to use `ref`.
+
+* `crypto_mGnP/ed25519`: `crypto_mGnP_ed25519(mGnP,m,n,P)` computes
+  `(m mod L)G−(n mod L)P` in Edwards coordinates, where `L` is the prime
+  number 2^252^+27742317777372353535851937790883648493 and `G` is the
+  same standard base point. This function takes time that depends on the
+  inputs.
+
+  The input `m` is an integer in the range {0,1,...,2^256^−1}
+  represented as a 32-byte array in little-endian form. Any `m` outside
+  the range {0,1,...,L−1} triggers a failure, which is reported as
+  described below.
+
+  The input `n` is an integer in the range {0,1,...,2^512^−1}
+  represented as a 64-byte array in little-endian form.
+
+  The input point `P` is represented as a 32-byte array as follows: the
+  (frozen) Edwards y-coordinate of `P` in {0,1,...,2^255^−20} is stored
+  in little-endian form, and then the top bit is set exactly when the
+  (frozen) Edwards x-coordinate of `P` is odd. An input 32-byte array
+  that does not have this form is instead interpreted as the point `P`
+  with Edwards coordinates (...8,26), and triggers a failure, reported
+  as described below.
+
+  The output is a 33-byte array. The first 32 bytes are the output point
+  `(m mod L)G−(n mod L)P`, represented the same way as `P`. The last
+  byte is 1 on success and 0 on failure.
+
+* `crypto_dh/x25519`: `crypto_dh_x25519_keypair(pk,sk)` generates a
+  32-byte X25519 public key `pk` and the corresponding 32-byte secret
+  key `sk`. This function is the composition of `randombytes` to
+  generate `sk` and `crypto_nG_montgomery25519` to generate `pk`.
+
+  `crypto_dh_x25519(k,pk,sk)` generates a 32-byte shared secret `k`
+  given a public key `pk` and a secret key `sk`. This function is the
+  same as `crypto_nP_montgomery25519`.
+
+* `crypto_sign/ed25519`: `crypto_sign_ed25519_keypair(pk,sk)` generates
+  a 32-byte Ed25519 public key `pk` and the corresponding 64-byte secret
+  key `sk`. This function takes constant time.
+
+  `crypto_sign_ed25519(sm,&smlen,m,mlen,sk)` generates an `smlen`-byte
+  signed message `sm` given an `mlen`-byte message `m` and a secret key
+  `sk`. The caller is required to allocate `mlen+64` bytes for `sm`. The
+  function always sets `smlen` to `mlen+64`. This function takes time
+  that depends on `mlen` but not on the other inputs.
+
+  `crypto_sign_ed25519_open(m,&mlen,sm,smlen,pk)` generates an
+  `mlen`-byte message `m` given an `smlen`-byte signed message `sm` and
+  a public key `pk`, and returns 0. However, if `sm` is invalid, this
+  function returns `-1`, sets `mlen` to `-1`, and clears `m`. The caller is
+  required to allocate `smlen` (not just `smlen-64`) bytes for `m`, for
+  example using the same array for `sm` and `m`. This function takes time
+  that depends on its inputs.
+
+lib25519 includes a command-line utility `lib25519-test` that runs some
+tests for each of these primitives, and another utility `lib25519-speed`
+that measures cycle counts for each of these primitives.
+
+The stable lib25519 API functions are built from the above primitives:
+
+* `lib25519_dh_keypair` is `crypto_dh_x25519_keypair`.
+* `lib25519_dh` is `crypto_dh_x25519`.
+* `lib25519_sign_keypair` is `crypto_sign_ed25519_keypair`.
+* `lib25519_sign` is `crypto_sign_ed25519`.
+* `lib25519_sign_open` is `crypto_sign_ed25519_open`.
+
+Some changes are anticipated in the list of primitives, but these API
+functions will remain stable.
+
+As in SUPERCOP and NaCl, message lengths intentionally use `long long`,
+not `size_t`. In lib25519, message lengths are signed.
+
+## Implementations
+
+A single primitive can, and usually does, have multiple implementations.
+Each implementation is in its own subdirectory. The implementations are
+required to have exactly the same input-output behavior, and to some
+extent this is tested, although it is not yet formally verified.
+
+Different implementations typically offer different tradeoffs between
+portability, simplicity, and efficiency. For example,
+`crypto_nP/montgomery25519/ref10` is portable;
+`crypto_nP/montgomery25519/amd64-maax` is faster and less portable.
+
+Each unportable implementation has an `architectures` file. Each line in
+this file identifies a CPU instruction set (and ABI) where the
+implementation works. For example,
+`crypto_nP/montgomery25519/amd64-maax/architectures` has one line
+`amd64 bmi2 adx`, meaning that the implementation works on CPUs that
+have the Intel/AMD 64-bit instruction set with the BMI2 and ADX
+instruction-set extensions. The top-level `compilers` directory shows
+(among other things) the allowed instruction-set names such as `bmi2`.
+
+At run time, lib25519 checks the CPU where it is running, and selects
+an implementation where `architectures` is compatible with that CPU.
+Each primitive makes its own selection once per program startup, using
+the compiler's `ifunc` mechanism. This type of run-time selection means,
+for example, that an `amd64` CPU without AVX2 can share binaries with an
+`amd64` CPU with AVX2. However, correctness requires instruction sets to
+be preserved by migration across cores via the OS kernel, VM migration,
+etc.
+
+The compiler has a `target` mechanism that makes an `ifunc` selection
+based on CPU architectures. Instead of using the `target` mechanism,
+lib25519 uses a more sophisticated mechanism that also accounts for
+benchmarks collected in advance of compilation.
+
+## Compilers
+
+lib25519 tries different C compilers for each implementation. For
+example, `compilers/default` lists the following compilers:
+
+        gcc -Wall -fPIC -fwrapv -O2
+        clang -Wall -fPIC -fwrapv -Qunused-arguments -O2
+
+Sometimes `gcc` produces better code, and sometimes `clang` produces
+better code.
+
+As another example, `compilers/amd64+avx` lists the following compilers:
+
+        gcc -Wall -fPIC -fwrapv -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mtune=sandybridge
+        clang -Wall -fPIC -fwrapv -Qunused-arguments -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mtune=sandybridge
+
+The `-mavx` option tells these compilers that they are free to use the
+AVX instruction-set extension.
+
+Code compiled using the compilers in `compilers/amd64+avx` will be
+considered at run time by the lib25519 selection mechanism if the
+`supports()` function in `compilers/amd64+avx.c` returns nonzero. This
+function checks whether the run-time CPU supports AVX (and SSE and so on,
+and OSXSAVE with XMM/YMM being saved;
+[https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85100](https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85100)
+says that all versions of gcc until 2018 handled this incorrectly in
+`target`). Similar comments apply to other `compilers/*` files.
+
+If some compilers fail (for example, clang is not installed, or the
+compiler version is too old to support the compiler options used in
+lib25519), the lib25519 compilation process will try its best to produce
+a working library using the remaining compilers, even if this means
+lower performance.
+
+## Trimming
+
+By default, to reduce size of the compiled library, the lib25519
+compilation process trims the library down to the implementations that
+are selected by lib25519's selection mechanism (across all CPUs; the
+library remains portable, not tied to the compilation CPU).
+
+This trimming is handled at link time rather than compile time to
+increase the chance that, even if some implementations are broken by
+compiler "upgrades", the library will continue to build successfully.
+
+To avoid this trimming, pass the `--notrim` option to `./configure`.
+All implementations that compile are then included in the library,
+tested by `lib2519-test`, and measured by `lib25519-speed`. You'll want
+to avoid trimming if you're adding new instruction sets or new
+implementations (see below), so that you can run tests and benchmarks of
+code that isn't selected yet.
+
+## How to recompile after changes
+
+If you make changes in the lib25519 source directory, you have to run
+`./configure` again to repopulate the build directory. Simply running
+`make` again doesn't suffice.
+
+By default, `./configure` cleans the build directory first, so `make`
+will recompile everything. This can be on the scale of seconds if you
+have enough cores, but maybe you're developing on a slower machine. Two
+options are currently available to accelerate the edit-compile cycle:
+
+   * There is an experimental `--noclean` option to `./configure` that,
+     for some simple types of changes, can produce a successful build
+     without cleaning.
+
+   * You can disable the implementations you're not using by setting
+     sticky bits on the source directories for those implementations:
+     e.g., `chmod +t crypto_nG/*/*avx2*`.
+
+Make sure to reenable all implementations and do a full clean build if
+you're collecting data to add to the source `benchmarks` directory.
+
+## How to add new instruction sets
+
+Adding another file `compilers/amd64+foo`, along with a `supports()`
+implementation in `compilers/amd64+foo.c`, will support a new
+instruction set. Do not assume that the new `foo` instruction set
+implies support for older instruction sets (the idea of "levels" of
+instruction sets); instead make sure to include the older instruction
+sets in `+` tags, as illustrated by
+`compilers/amd64+avx+bmi2+avx2+adx+avx512f+avx512vl+avx512ifma`.
+
+In the compiler options, always make sure to include `-fPIC` to support
+shared libraries, and `-fwrapv` to switch to a slightly less dangerous
+version of C.
+
+The `foo` tags don't have to be instruction sets. For example, if a CPU
+has the same instruction set but wants different optimizations because
+of differences in instruction timings, you can make a tag for those
+optimizations, using, e.g., CPU IDs or benchmarks in the corresponding
+`supports()` function to decide whether to enable those optimizations.
+Benchmarks tend to be more future-proof than a list of CPU IDs, but the
+time taken for benchmarks at program startup has to be weighed against
+the subsequent speedup from the resulting optimizations.
+
+To see how well lib25519 performs with the new compilers, run
+`lib25519-speed` on the target machine and look for the `foo` lines in
+the output. If the new performance is better than the performance shown
+on the `selected` lines:
+
+   * Copy the `lib25519-speed` output into a file on the `benchmarks`
+     directory, typically named after the hostname of the target
+     machine.
+
+   * Run `./prioritize` in the top-level directory to create `priority`
+     files. These files tell lib25519 which implementations to select
+     for any given architecture.
+
+   * Reconfigure (again with `--notrim`), recompile, rerun
+     `lib25519-test`, and rerun `lib25519-speed` to check that the
+     `default` lines now use the `foo` compiler.
+
+If the `foo` implementation is outperformed by other implementations,
+then these steps don't help except for documenting this fact. The same
+implementation might turn out to be useful for subsequent `foo` CPUs.
+
+## How to add new implementations
+
+Taking full advantage of the `foo` instruction set usually requires
+writing new implementations. Sometimes there are also ideas for taking
+better advantage of existing instruction sets.
+
+Structurally, adding a new implementation of a primitive is a simple
+matter of adding a new subdirectory with the code for that
+implementation. Most of the work is optimizing the use of `foo`
+intrinsics in `.c` files or `foo` instructions in `.S` files. Make sure
+to include an `architectures` file saying, e.g., `amd64 avx2 foo`.
+
+Names of implementation directories can use letters, digits, dashes, and
+underscores. Do not use two implementation names that are the same when
+dashes and underscores are removed.
+
+All `.c` and `.S` files in the implementation directory are compiled and
+linked. There is no need to edit a separate list of these files. You can
+also use `.h` files via the C preprocessor.
+
+If an implementation is actually more restrictive than indicated in
+`architectures` then the resulting compiled library will fail on some
+machines (although perhaps that implementation will not be used by
+default). Putting unnecessary restrictions into `architectures` will not
+create such failures, but can unnecessarily limit performance.
+
+Some, but not all, mistakes in `architectures` will produce warnings
+from the `checkinsns` script that runs automatically when lib25519 is
+compiled. Running the `lib25519-test` program tries all implementations,
+but only on the CPU where `lib25519-test` is being run, and `lib25519-test`
+does not guarantee code coverage: for example, other message lengths
+being signed could involve other code paths.
+
+`amd64` implies little-endian, and implies architectural support for
+unaligned loads and stores. Beware, however, that the Intel/AMD
+vectorized `load`/`store` intrinsics (and the underlying `movdqa`
+instruction) require alignment; if in doubt, use `loadu`/`storeu` (and
+`movdqu`). The `lib25519-test` program checks unaligned inputs and
+outputs, but can miss issues with unaligned stack variables.
+
+To test your implementation, compile everything, check for compiler
+warnings and errors, run `lib25519-test` (or just `lib25519-test nG` to
+test a `crypto_nG` implementation), and check for a line saying `all
+tests succeeded`. To use AddressSanitizer (for catching, at run time,
+buffer overflows in C code), add `-fsanitize=address` to the `gcc` and
+`clang` lines in `compilers/*`.
+
+To see the performance of your implementation, run `lib25519-speed`.
+If the new performance is better than the performance shown on the
+`default` lines, follow the same steps as for a new instruction set:
+copy the `lib25519-speed` output into a file on the `benchmarks`
+directory; run `./prioritize` in the top-level directory to create
+`priority` files; reconfigure (again with `--notrim`); recompile; rerun
+`lib25519-test`; rerun `lib25519-speed`; check that the `default` lines
+now use the new implementation.
+
+## How to handle namespacing
+
+As in SUPERCOP and NaCl, to call `crypto_hash_sha512()`, you have to
+include `crypto_hash_sha512.h`; but to write an implementation of
+`crypto_hash_sha512()`, you have to instead include `crypto_hash.h` and
+define `crypto_hash`. Similar comments apply to other primitives.
+
+The function name that's actually linked might end up as, e.g.,
+`lib25519_hash_sha512_blocksplusavx_C2_hash` where `blocksplusavx`
+indicates the implementation and `C2` indicates the compiler. Don't try
+to build this name into your implementation.
+
+If you have another global symbol `x` (for example, a non-`static`
+function in a `.c` file, or a non-`static` variable outside functions in
+a `.c` file), you have to replace it with `CRYPTO_NAMESPACE(x)`, for
+example with `#define x CRYPTO_NAMESPACE(x)`.
+
+For global symbols in `.S` files and `shared-*.c` files, use
+`CRYPTO_SHARED_NAMESPACE` instead of `CRYPTO_NAMESPACE`. For `.S` files
+that define both `x` and `_x` to handle platforms where `x` in C is `_x`
+in assembly, use `CRYPTO_SHARED_NAMESPACE(x)` and
+`_CRYPTO_SHARED_NAMESPACE(x)`; `CRYPTO_SHARED_NAMESPACE(_x)` is not
+sufficient.
+
+lib25519 includes a mechanism to recognize files that are copied across
+implementations (possibly of different primitives) and to unify those
+into a file compiled only once, reducing the overall size of the
+compiled library and possibly improving cache utilization. To request
+this mechanism, include a line `// linker define x` for any global
+symbol `x` defined in the file, and a line `// linker use x` for any
+global symbol `x` used in the file from the same implementation (not
+`crypto_*` subroutines that you're calling, `randombytes`, etc.). This
+mechanism tries very hard, perhaps too hard, to avoid improperly
+unifying files: for example, even a slight difference in a `.h` file
+included by a file defining a used symbol will disable the mechanism.
+
+Typical namespacing mistakes will produce either linker failures or
+warnings from the `checknamespace` script that runs automatically when
+lib25519 is compiled.
@@ -0,0 +1,121 @@
+Creative Commons Legal Code
+
+CC0 1.0 Universal
+
+    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+    HEREUNDER.
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display,
+     communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+     likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+     subject to the limitations in paragraph 4(a), below;
+  v. rights protecting the extraction, dissemination, use and reuse of data
+     in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+     European Parliament and of the Council of 11 March 1996 on the legal
+     protection of databases, and under any national implementation
+     thereof, including any amended or successor version of such
+     directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+     world based on applicable law or treaty, and any national
+     implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+    surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+    warranties of any kind concerning the Work, express, implied,
+    statutory or otherwise, including without limitation warranties of
+    title, merchantability, fitness for a particular purpose, non
+    infringement, or the absence of latent or other defects, accuracy, or
+    the present or absence of errors, whether or not discoverable, all to
+    the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+    that may apply to the Work or any use thereof, including without
+    limitation any person's Copyright and Related Rights in the Work.
+    Further, Affirmer disclaims responsibility for obtaining any necessary
+    consents, permissions or other rights required for any use of the
+    Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+    party to this document and has no duty or obligation with respect to
+    this CC0 or use of the Work.
@@ -0,0 +1,8 @@
+default:
+	cd build && $(MAKE)
+
+install:
+	cd build && $(MAKE) install
+
+clean:
+	cd build && $(MAKE) clean
@@ -0,0 +1,28 @@
+lib25519 is a microlibrary for the X25519 encryption system and the
+Ed25519 signature system, both of which use the Curve25519 elliptic
+curve. Curve25519 is the fastest curve in TLS 1.3, and the only curve in
+Wireguard, Signal, and many other applications (see Nicolai Brown's page
+https://ianix.com/pub/curve25519-deployment.html).
+
+lib25519 has a very simple stateless API based on the SUPERCOP API, with
+wire-format inputs and outputs, providing functions that directly match
+the central cryptographic operations in X25519 and Ed25519:
+
+    lib25519_dh_keypair(pk,sk): X25519 key generation
+    lib25519_dh(k,pk,sk): shared-secret generation
+
+    lib25519_sign_keypair(pk,sk): Ed25519 key generation
+    lib25519_sign(sm,&smlen,m,mlen,sk): signing
+    lib25519_sign_open(m,&mlen,sm,smlen,pk): verification + message recovery
+
+Internally, lib25519 includes implementations designed for performance
+on various CPUs, implementations designed to work portably across CPUs,
+and automatic run-time selection of implementations.
+
+lib25519 is intended to be called by larger multi-function libraries,
+including libraries in other languages via FFI. The idea is that
+lib25519 will take responsibility for the details of X25519/Ed25519
+computation, including optimization, timing-attack protection, and
+eventually verification, freeing up the calling libraries to concentrate
+on application-specific needs such as protocol integration. Applications
+can also call lib25519 directly.
@@ -0,0 +1,87 @@
+Security model: X25519 is designed to be strong as a component of
+various well-known "hashed DH" applications, and in particular is
+designed to make the CDH problem difficult with respect to the standard
+base. Ed25519 is designed to provide EUF-CMA, the standard notion of
+unforgeability of a signature system under chosen-message attacks.
+However, some applications need other security notions that are not
+provided by X25519 and Ed25519.
+
+Security level: X25519 and Ed25519 are more difficult to break by any
+known attacks than a typical 128-bit cipher. They have an extremely
+stable security track record, with two decades of research changing
+security levels by only a fraction of a bit. They also proactively avoid
+various potential risks. However, large quantum computers will break
+both X25519 and Ed25519.
+
+Software verification: lib25519 is intended to become a central target
+for verification of full functional correctness of implementations of
+X25519 and Ed25519. However, only certain portions have been verified so
+far, and at this point the code should be presumed to have critical
+bugs.
+
+Timing attacks: lib25519 is designed to avoid all data flow from secret
+data to memory addresses and branch conditions. Fully protecting the
+user against timing attacks requires addressing more issues, such as the
+following:
+
+   * Other common instructions used by lib25519 take variable time on
+     some CPUs. In particular, there are some embedded CPUs with
+     variable-time multipliers.
+
+   * Many CPUs include dynamic frequency-selection mechanisms such as
+     Turbo Boost, exposing power information via timing information.
+     Fortunately, these CPUs are normally shipped with simple options to
+     disable Turbo Boost etc., closing this leak; unfortunately, Turbo
+     Boost is enabled by default on CPUs that support it.
+
+   * Cryptographic keys are normally handled by cryptographic software,
+     but other user secrets are handled by many different pieces of
+     software.
+
+See https://timing.attacks.cr.yp.to for a timing-attack survey and many
+references.
+
+Speculative-execution attacks: Some countermeasures against
+speculative-execution attacks are planned but are not included in the
+current version of lib25519. Full protection again requires addressing
+issues at other system layers.
+
+Further side-channel attacks: Even if all legitimate user sensors are
+successfully kept isolated from attackers, attackers can set up their
+own power sensors, electromagnetic sensors, acoustic sensors, etc.
+Keeping cryptographic operations physically separated from sensors tends
+to make such attacks much more expensive but is often infeasible.
+"Masking" cryptographic computations seems to help and can be
+affordable, although the security of masking is difficult to evaluate
+and there are many broken masked implementations. Currently lib25519
+does not include any masked implementations, so presumably it is easily
+breakable by power attacks in environments where attackers can see power
+consumption.
+
+Further attacks: lib25519 creates an Ed25519 signing nonce as a hash of
+the message, a long-term secret, and new randomness (specifically, the
+nonce is a keyed hash of the message, where the key is the hash of the
+long-term secret and new randomness). The literature identifies various
+advantages and disadvantages of including these hash inputs:
+
+   * Including the message and a long-term secret protects against
+     signing-time RNG failures. This is a standard feature of Ed25519
+     signers.
+
+   * To the extent that the RNG works, including new randomness has the
+     advantage of stopping (e.g.) fault attacks that rely on a nonce
+     being reused for multiple signatures of the same message.
+
+   * Including new randomness has the disadvantage of requiring state
+     for the RNG. However, lib25519 runs within an OS that in any case
+     maintains state and provides an RNG.
+
+   * Including new randomness also has the disadvantage of interfering
+     with the use of test vectors. This disadvantage does not apply to
+     lib25519: lib25519's test vectors already handle randomness.
+
+lib25519 includes a few further steps that could be useful in stopping
+fault attacks (for example, signature verification internally converts
+invalid public keys to the key (...,26), which does not have a known
+discrete logarithm), but in general lib25519 should be presumed
+breakable by fault attacks.
@@ -0,0 +1,17 @@
+track history more precisely in documentation for individual source files
+consider symlinks from files in build tree to source tree (via a build/source symlink so build can link elsewhere)
+allow shared-* for API functions (requires tweaking dispatch)
+speedups for more architectures
+speedups for more microarchitectures
+consider faster PRF for the keyed hash giving the nonce
+merge subroutines in source to the extent possible
+scan for and remove any unused functions and files
+restructure for more merging at object-code level
+sort object files (for, e.g., improved cache utilization)
+optionally allow post-installation patching of current cpu as an exceptional cpuid
+  (based on benchmarks and, with more CPU time, full functionality tests)
+dispatch: eliminate, e.g., avx2 if avx is higher priority
+speed up dispatch cpuid tests (lazy evaluation, merging cpuid calls)
+randombytes: support getrandom, getentropy
+verify constbranch, constindex
+full functional verification
@@ -0,0 +1,51 @@
+crypto_verify/32
+#define crypto_verify_32_BYTES 32
+int crypto_verify(const unsigned char *,const unsigned char *);
+
+crypto_hashblocks/sha512 f0bc623a9033f9f648336540e11e85be21aeb60905c7d8808d10ea20b39d58d1 f1a2c46c9ce7fa4cd22f180907d77b6f7189badef4b9a1b5284d6fb9db859b76
+#define crypto_hashblocks_sha512_STATEBYTES 64
+#define crypto_hashblocks_sha512_BLOCKBYTES 128
+int crypto_hashblocks(unsigned char *,const unsigned char *,long long);
+
+crypto_hash/sha512 8220572f58bd4730be165c9739d8d4b0fd2e0229dbe01e25b4aed23f00f23b70 c1e322b7cbfc941260c5508967ba05bce22eeee94d425e708b7c3301ea1d5e2e
+#define crypto_hash_sha512_BYTES 64
+void crypto_hash(unsigned char *,const unsigned char *,long long);
+
+crypto_pow/inv25519 ad2062946e82718da820226504991a85c5fe56bdbff959c1313f837ee13b37be 59b3045a01e1fca2a86a0280aee8b985c5e040afdc0d3e85ed87eb97a46a4dd6 
+#define crypto_pow_inv25519_BYTES 32
+void crypto_pow(unsigned char *,const unsigned char *);
+
+crypto_nP/montgomery25519 b861d66109b42359e5994ed57ae566827c345b65a9d0671700320b82888397ec 740924011f3448f65299f61b087f74a6eb9651a4203dfbf621d2bec54e149405
+#define crypto_nP_montgomery25519_SCALARBYTES 32
+#define crypto_nP_montgomery25519_POINTBYTES 32
+void crypto_nP(unsigned char *,const unsigned char *,const unsigned char *);
+
+crypto_nG/merged25519 a4e761839798a07817484e97605bd63215b4938934ed9ce01935bbced48155bc 0a01c09fc8a8c7e8c18f841b2e1b2da9c156868737d194d223b03531cf2db731 
+#define crypto_nG_merged25519_SCALARBYTES 32
+#define crypto_nG_merged25519_POINTBYTES 32
+crypto_nG/montgomery25519 5c8a5d8b32e3d26b33071779ce9191095d7bd4ab3bb6a40b68976e41a98cfc3b 2becc8cd065820fcf82e53a03c5b5235582480fc11d072f2bd15153aebd4e057
+#define crypto_nG_montgomery25519_SCALARBYTES 32
+#define crypto_nG_montgomery25519_POINTBYTES 32
+void crypto_nG(unsigned char *,const unsigned char *);
+
+crypto_mGnP/ed25519 dc80be44fb0d482c5ae430779e76fe612c53fcd9e5847254bf27ab34e90745f4 9e1a3b7015c8fdb12763fd88494f5bfe9e2565ead4d3407d5ecf7ff6ca24c1d0
+#define crypto_mGnP_ed25519_MBYTES 32
+#define crypto_mGnP_ed25519_NBYTES 64
+#define crypto_mGnP_ed25519_PBYTES 32
+#define crypto_mGnP_ed25519_OUTPUTBYTES 33
+void crypto_mGnP(unsigned char *,const unsigned char *,const unsigned char *,const unsigned char *);
+
+crypto_dh/x25519 2c8a73ec86d5d4c4bc838f49cfd78c87b60b534ae6fff59ce3bea0c32cdc1450 b09016b3a1371786b46a183085133338159e623c5eb9cbc5eaa4f8b62d6c5aea
+#define crypto_dh_x25519_SECRETKEYBYTES 32
+#define crypto_dh_x25519_PUBLICKEYBYTES 32
+#define crypto_dh_x25519_BYTES 32
+void crypto_dh_keypair(unsigned char *,unsigned char *);
+void crypto_dh(unsigned char *,const unsigned char *,const unsigned char *);
+
+crypto_sign/ed25519 ce11fd7c1eac4dd0bc5eec49b26ad1e91aef696fae50ce377dbd806dc394da01 2ed857f17c917a8185e6c296303a11772ae45683a5e7cb5b095489bad65fffde
+#define crypto_sign_ed25519_SECRETKEYBYTES 64
+#define crypto_sign_ed25519_PUBLICKEYBYTES 32
+#define crypto_sign_ed25519_BYTES 64
+void crypto_sign_keypair(unsigned char *,unsigned char *);
+void crypto_sign(unsigned char *,long long *,const unsigned char *,long long,const unsigned char *);
+int crypto_sign_open(unsigned char *,long long *,const unsigned char *,long long,const unsigned char *);
@@ -0,0 +1,338 @@
+#!/usr/bin/env python3
+
+output = r'''/* WARNING: auto-generated (by autogen-speed); do not edit */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/resource.h>
+#include "cpucycles.h" /* -lcpucycles */
+#include "lib25519.h" /* -l25519 */
+#include "randombytes.h" /* -lrandombytes_kernel */
+
+static const char *targeto = 0;
+static const char *targetp = 0;
+static const char *targeti = 0;
+
+#include "limits.inc"
+
+static unsigned char *alignedcalloc(unsigned long long len)
+{
+  unsigned char *x = (unsigned char *) calloc(1,len + 128);
+  if (!x) abort();
+  /* will never deallocate so shifting is ok */
+  x += 63 & (-(unsigned long) x);
+  return x;
+}
+
+#define TIMINGS 15
+static long long t[TIMINGS+1];
+
+static void t_print(const char *op,long long impl,long long len)
+{
+  long long median = 0;
+
+  printf("%s",op);
+  if (impl >= 0)
+    printf(" %lld",impl);
+  else
+    printf(" selected");
+  printf(" %lld",len);
+  for (long long i = 0;i < TIMINGS;++i)
+    t[i] = t[i+1]-t[i];
+  for (long long j = 0;j < TIMINGS;++j) {
+    long long belowj = 0;
+    long long abovej = 0;
+    for (long long i = 0;i < TIMINGS;++i) if (t[i] < t[j]) ++belowj;
+    for (long long i = 0;i < TIMINGS;++i) if (t[i] > t[j]) ++abovej;
+    if (belowj*2 < TIMINGS && abovej*2 < TIMINGS) {
+      median = t[j];
+      break;
+    }
+  }
+  printf(" %lld ",median);
+  for (long long i = 0;i < TIMINGS;++i)
+    printf("%+lld",t[i]-median);
+  printf("\n");
+  fflush(stdout);
+}
+
+#define MAXTEST_BYTES 65536
+
+static void measure_cpucycles(void)
+{
+  printf("cpucycles selected persecond %lld\n",cpucycles_persecond());
+  printf("cpucycles selected implementation %s\n",cpucycles_implementation());
+
+  for (long long i = 0;i <= TIMINGS;++i)
+    t[i] = cpucycles();
+  t_print("cpucycles",-1,0);
+}
+
+static void measure_randombytes(void)
+{
+  unsigned char *m = alignedcalloc(MAXTEST_BYTES);
+  long long mlen = 0;
+
+  while (mlen < MAXTEST_BYTES) {
+    for (long long i = 0;i <= TIMINGS;++i) {
+      t[i] = cpucycles();
+      randombytes(m,mlen);
+    }
+    t_print("randombytes",-1,mlen);
+    mlen += 1+mlen/2;
+  }
+}
+'''
+
+# XXX: integrate todo into api
+todo = (
+  ('verify',(
+    ('x','lib25519_verify_BYTES'),
+    ('y','lib25519_verify_BYTES'),
+  ),(
+    ('crypto_verify','x','y'),
+  )),
+  ('hashblocks',(
+    ('h','lib25519_hashblocks_STATEBYTES'),
+    ('m','MAXTEST_BYTES'),
+    ('mlen',None),
+  ),(
+    ('crypto_hashblocks','h','m','mlen'),
+  )),
+  ('hash',(
+    ('h','lib25519_hash_BYTES'),
+    ('m','MAXTEST_BYTES'),
+    ('mlen',None),
+  ),(
+    ('crypto_hash','h','m','mlen'),
+  )),
+  ('pow',(
+    ('n','lib25519_pow_BYTES'),
+    ('ne','lib25519_pow_BYTES'),
+  ),(
+    ('crypto_pow','ne','n'),
+  )),
+  ('nP',(
+    ('n','lib25519_nP_SCALARBYTES'),
+    ('P','lib25519_nP_POINTBYTES'),
+    ('nP','lib25519_nP_POINTBYTES'),
+  ),(
+    ('crypto_nP','nP','n','P'),
+  )),
+  ('nG',(
+    ('n','lib25519_nP_SCALARBYTES'),
+    ('nG','lib25519_nP_POINTBYTES'),
+  ),(
+    ('crypto_nG','nG','n'),
+  )),
+  ('mGnP',(
+    ('mGnP','lib25519_mGnP_OUTPUTBYTES'),
+    ('m','lib25519_mGnP_MBYTES'),
+    ('n','lib25519_mGnP_NBYTES'),
+    ('P','lib25519_mGnP_PBYTES'),
+  ),(
+    ('crypto_mGnP','mGnP','m','n','P'),
+  )),
+  ('dh',(
+    ('pka','lib25519_dh_PUBLICKEYBYTES'),
+    ('ska','lib25519_dh_SECRETKEYBYTES'),
+    ('pkb','lib25519_dh_PUBLICKEYBYTES'),
+    ('skb','lib25519_dh_SECRETKEYBYTES'),
+    ('ka','lib25519_dh_BYTES'),
+  ),(
+    ('crypto_dh_keypair','pka','ska'),
+    ('crypto_dh_keypair','pkb','skb'),
+    ('crypto_dh','ka','pkb','ska'),
+  )),
+  ('sign',(
+    ('pk','lib25519_sign_PUBLICKEYBYTES'),
+    ('sk','lib25519_sign_SECRETKEYBYTES'),
+    ('m','MAXTEST_BYTES+lib25519_sign_BYTES'),
+    ('sm','MAXTEST_BYTES+lib25519_sign_BYTES'),
+    ('m2','MAXTEST_BYTES+lib25519_sign_BYTES'),
+    ('mlen',None),
+    ('smlen',None),
+    ('m2len',None),
+  ),(
+    ('crypto_sign_keypair','pk','sk'),
+    ('crypto_sign','sm','&smlen','m','mlen','sk'),
+    ('crypto_sign_open','m2','&m2len','sm','smlen','pk'),
+  )),
+)
+
+operations = []
+primitives = {}
+sizes = {}
+exports = {}
+prototypes = {}
+
+with open('api') as f:
+  for line in f:
+    line = line.strip()
+    if line.startswith('crypto_'):
+      x = line.split()
+      x = x[0].split('/')
+      assert len(x) == 2
+      o = x[0].split('_')[1]
+      if o not in operations: operations += [o]
+      p = x[1]
+      if o not in primitives: primitives[o] = []
+      primitives[o] += [p]
+      continue
+    if line.startswith('#define '):
+      x = line.split(' ')
+      x = x[1].split('_')
+      assert len(x) == 4
+      assert x[0] == 'crypto'
+      o = x[1]
+      p = x[2]
+      if (o,p) not in sizes: sizes[o,p] = ''
+      sizes[o,p] += line+'\n'
+      continue
+    if line.endswith(');'):
+      fun,args = line[:-2].split('(')
+      rettype,fun = fun.split()
+      fun = fun.split('_')
+      o = fun[1]
+      assert fun[0] == 'crypto'
+      if o not in exports: exports[o] = []
+      exports[o] += ['_'.join(fun[1:])]
+      if o not in prototypes: prototypes[o] = []
+      prototypes[o] += [(rettype,fun,args)]
+
+for t in todo:
+  o,vars,benches = t
+
+  for p in primitives[o]:
+
+    output += '\n'
+    output += 'static void measure_%s_%s(void)\n' % (o,p)
+    output += '{\n'
+
+    output += '  if (targeto && strcmp(targeto,"%s")) return;\n' % o
+    output += '  if (targetp && strcmp(targetp,"%s")) return;\n' % p
+
+    varsize = {}
+    for v,size in vars:
+      if size is None:
+        output += '  long long %s;\n' % v
+      else:
+        size = size.replace('lib25519_'+o,'lib25519_'+o+'_'+p)
+        output += '  unsigned char *%s = alignedcalloc(%s);\n' % (v,size)
+        varsize[v] = size
+    output += '\n'
+  
+    output += '  for (long long impl = -1;impl < lib25519_numimpl_%s_%s();++impl) {\n' % (o,p)
+    for rettype,fun,args in prototypes[o]:
+      output += '    %s (*%s)(%s);\n' % (rettype,'_'.join(fun),args)
+
+    output += '    if (targeti && strcmp(targeti,lib25519_dispatch_%s_%s_implementation(impl))) continue;\n' % (o,p)
+
+    output += '    if (impl >= 0) {\n'
+    for rettype,fun,args in prototypes[o]:
+      f2 = ['lib25519','dispatch',o,p]+fun[2:]
+      output += '      %s = %s(impl);\n' % ('_'.join(fun),'_'.join(f2))
+    output += r'      printf("%s_%s %%lld implementation %%s compiler %%s\n",impl,lib25519_dispatch_%s_%s_implementation(impl),lib25519_dispatch_%s_%s_compiler(impl));' % (o,p,o,p,o,p)
+    output += '\n'
+    output += '    } else {\n'
+    for rettype,fun,args in prototypes[o]:
+      f2 = ['lib25519',o,p]+fun[2:]
+      output += '      %s = %s;\n' % ('_'.join(fun),'_'.join(f2))
+    output += r'      printf("%s_%s selected implementation %%s compiler %%s\n",lib25519_%s_%s_implementation(),lib25519_%s_%s_compiler());' % (o,p,o,p,o,p)
+    output += '\n'
+    output += '    }\n'
+
+    for v,size in vars:
+      if size is not None:
+        size = size.replace('lib25519_'+o,'lib25519_'+o+'_'+p)
+        output += '    randombytes(%s,%s);\n' % (v,size)
+
+    alreadybenched = set()
+    alreadybenched.add('assert')
+
+    for b in benches:
+      if b[0] in alreadybenched:
+        output += '    %s(%s);\n' % (b[0],','.join(b[1:]))
+        continue
+
+      fun = b[0].split('_')
+      shortfun = '_'.join([o,p]+fun[2:])
+      alreadybenched.add(b[0])
+
+      if 'mlen' in b[1:] or 'smlen' in b[1:]:
+        output += '    mlen = 0;\n'
+        output += '    while (mlen <= MAXTEST_BYTES) {\n'
+        output += '      randombytes(m,mlen);\n'
+
+        if shortfun == 'sign_ed25519_open': # XXX: put this into todo
+          output += '      lib25519_sign(sm,&smlen,m,mlen,sk);\n'
+
+        output += '      for (long long i = 0;i <= TIMINGS;++i) {\n'
+        output += '        t[i] = cpucycles();\n'
+        output += '        %s(%s);\n' % (b[0],','.join(b[1:]))
+        output += '      }\n'
+        output += '      t_print("%s",impl,mlen);\n' % (shortfun)
+
+        if shortfun == 'sign_ed25519_open': # XXX: put this into todo
+          output += '      /* this is, in principle, not a test program */\n'
+          output += '      /* but some checks here help validate the data flow above */\n'
+          output += '      assert(m2len == mlen);\n'
+          output += '      assert(!memcmp(m,m2,mlen));\n'
+
+        if o == 'sign': # XXX: put this into todo
+          output += '      mlen += 1+mlen/4;\n'
+        else:
+          output += '      mlen += 1+mlen/2;\n'
+        output += '    }\n'
+      else:
+        output += '    for (long long i = 0;i <= TIMINGS;++i) {\n'
+        output += '      t[i] = cpucycles();\n'
+        output += '      %s(%s);\n' % (b[0],','.join(b[1:]))
+        output += '    }\n'
+        output += '    t_print("%s",impl,%s);\n' % (shortfun,varsize[b[1]])
+
+    output += '  }\n'
+    output += '}\n'
+
+output += r'''
+#include "print_cpuid.inc"
+
+int main(int argc,char **argv)
+{
+  printf("lib25519 version %s\n",lib25519_version);
+  printf("lib25519 arch %s\n",lib25519_arch);
+  print_cpuid();
+
+  if (*argv) ++argv;
+  if (*argv) {
+    targeto = *argv++;
+    if (*argv) {
+      targetp = *argv++;
+      if (*argv) {
+        targeti = *argv++;
+      }
+    }
+  }
+
+  measure_cpucycles();
+  measure_randombytes();
+  limits();
+'''
+
+for t in todo:
+  o,vars,benches = t
+  for p in primitives[o]:
+    output += '  measure_%s_%s();\n' % (o,p)
+
+output += r'''
+  return 0;
+}
+'''
+
+with open('command/lib25519-speed.c','w') as f:
+  f.write(output)
@@ -0,0 +1,466 @@
+/* WARNING: auto-generated (by autogen-speed); do not edit */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/resource.h>
+#include "cpucycles.h" /* -lcpucycles */
+#include "lib25519.h" /* -l25519 */
+#include "randombytes.h" /* -lrandombytes_kernel */
+
+static const char *targeto = 0;
+static const char *targetp = 0;
+static const char *targeti = 0;
+
+#include "limits.inc"
+
+static unsigned char *alignedcalloc(unsigned long long len)
+{
+  unsigned char *x = (unsigned char *) calloc(1,len + 128);
+  if (!x) abort();
+  /* will never deallocate so shifting is ok */
+  x += 63 & (-(unsigned long) x);
+  return x;
+}
+
+#define TIMINGS 15
+static long long t[TIMINGS+1];
+
+static void t_print(const char *op,long long impl,long long len)
+{
+  long long median = 0;
+
+  printf("%s",op);
+  if (impl >= 0)
+    printf(" %lld",impl);
+  else
+    printf(" selected");
+  printf(" %lld",len);
+  for (long long i = 0;i < TIMINGS;++i)
+    t[i] = t[i+1]-t[i];
+  for (long long j = 0;j < TIMINGS;++j) {
+    long long belowj = 0;
+    long long abovej = 0;
+    for (long long i = 0;i < TIMINGS;++i) if (t[i] < t[j]) ++belowj;
+    for (long long i = 0;i < TIMINGS;++i) if (t[i] > t[j]) ++abovej;
+    if (belowj*2 < TIMINGS && abovej*2 < TIMINGS) {
+      median = t[j];
+      break;
+    }
+  }
+  printf(" %lld ",median);
+  for (long long i = 0;i < TIMINGS;++i)
+    printf("%+lld",t[i]-median);
+  printf("\n");
+  fflush(stdout);
+}
+
+#define MAXTEST_BYTES 65536
+
+static void measure_cpucycles(void)
+{
+  printf("cpucycles selected persecond %lld\n",cpucycles_persecond());
+  printf("cpucycles selected implementation %s\n",cpucycles_implementation());
+
+  for (long long i = 0;i <= TIMINGS;++i)
+    t[i] = cpucycles();
+  t_print("cpucycles",-1,0);
+}
+
+static void measure_randombytes(void)
+{
+  unsigned char *m = alignedcalloc(MAXTEST_BYTES);
+  long long mlen = 0;
+
+  while (mlen < MAXTEST_BYTES) {
+    for (long long i = 0;i <= TIMINGS;++i) {
+      t[i] = cpucycles();
+      randombytes(m,mlen);
+    }
+    t_print("randombytes",-1,mlen);
+    mlen += 1+mlen/2;
+  }
+}
+
+static void measure_verify_32(void)
+{
+  if (targeto && strcmp(targeto,"verify")) return;
+  if (targetp && strcmp(targetp,"32")) return;
+  unsigned char *x = alignedcalloc(lib25519_verify_32_BYTES);
+  unsigned char *y = alignedcalloc(lib25519_verify_32_BYTES);
+
+  for (long long impl = -1;impl < lib25519_numimpl_verify_32();++impl) {
+    int (*crypto_verify)(const unsigned char *,const unsigned char *);
+    if (targeti && strcmp(targeti,lib25519_dispatch_verify_32_implementation(impl))) continue;
+    if (impl >= 0) {
+      crypto_verify = lib25519_dispatch_verify_32(impl);
+      printf("verify_32 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_verify_32_implementation(impl),lib25519_dispatch_verify_32_compiler(impl));
+    } else {
+      crypto_verify = lib25519_verify_32;
+      printf("verify_32 selected implementation %s compiler %s\n",lib25519_verify_32_implementation(),lib25519_verify_32_compiler());
+    }
+    randombytes(x,lib25519_verify_32_BYTES);
+    randombytes(y,lib25519_verify_32_BYTES);
+    for (long long i = 0;i <= TIMINGS;++i) {
+      t[i] = cpucycles();
+      crypto_verify(x,y);
+    }
+    t_print("verify_32",impl,lib25519_verify_32_BYTES);
+  }
+}
+
+static void measure_hashblocks_sha512(void)
+{
+  if (targeto && strcmp(targeto,"hashblocks")) return;
+  if (targetp && strcmp(targetp,"sha512")) return;
+  unsigned char *h = alignedcalloc(lib25519_hashblocks_sha512_STATEBYTES);
+  unsigned char *m = alignedcalloc(MAXTEST_BYTES);
+  long long mlen;
+
+  for (long long impl = -1;impl < lib25519_numimpl_hashblocks_sha512();++impl) {
+    int (*crypto_hashblocks)(unsigned char *,const unsigned char *,long long);
+    if (targeti && strcmp(targeti,lib25519_dispatch_hashblocks_sha512_implementation(impl))) continue;
+    if (impl >= 0) {
+      crypto_hashblocks = lib25519_dispatch_hashblocks_sha512(impl);
+      printf("hashblocks_sha512 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_hashblocks_sha512_implementation(impl),lib25519_dispatch_hashblocks_sha512_compiler(impl));
+    } else {
+      crypto_hashblocks = lib25519_hashblocks_sha512;
+      printf("hashblocks_sha512 selected implementation %s compiler %s\n",lib25519_hashblocks_sha512_implementation(),lib25519_hashblocks_sha512_compiler());
+    }
+    randombytes(h,lib25519_hashblocks_sha512_STATEBYTES);
+    randombytes(m,MAXTEST_BYTES);
+    mlen = 0;
+    while (mlen <= MAXTEST_BYTES) {
+      randombytes(m,mlen);
+      for (long long i = 0;i <= TIMINGS;++i) {
+        t[i] = cpucycles();
+        crypto_hashblocks(h,m,mlen);
+      }
+      t_print("hashblocks_sha512",impl,mlen);
+      mlen += 1+mlen/2;
+    }
+  }
+}
+
+static void measure_hash_sha512(void)
+{
+  if (targeto && strcmp(targeto,"hash")) return;
+  if (targetp && strcmp(targetp,"sha512")) return;
+  unsigned char *h = alignedcalloc(lib25519_hash_sha512_BYTES);
+  unsigned char *m = alignedcalloc(MAXTEST_BYTES);
+  long long mlen;
+
+  for (long long impl = -1;impl < lib25519_numimpl_hash_sha512();++impl) {
+    void (*crypto_hash)(unsigned char *,const unsigned char *,long long);
+    if (targeti && strcmp(targeti,lib25519_dispatch_hash_sha512_implementation(impl))) continue;
+    if (impl >= 0) {
+      crypto_hash = lib25519_dispatch_hash_sha512(impl);
+      printf("hash_sha512 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_hash_sha512_implementation(impl),lib25519_dispatch_hash_sha512_compiler(impl));
+    } else {
+      crypto_hash = lib25519_hash_sha512;
+      printf("hash_sha512 selected implementation %s compiler %s\n",lib25519_hash_sha512_implementation(),lib25519_hash_sha512_compiler());
+    }
+    randombytes(h,lib25519_hash_sha512_BYTES);
+    randombytes(m,MAXTEST_BYTES);
+    mlen = 0;
+    while (mlen <= MAXTEST_BYTES) {
+      randombytes(m,mlen);
+      for (long long i = 0;i <= TIMINGS;++i) {
+        t[i] = cpucycles();
+        crypto_hash(h,m,mlen);
+      }
+      t_print("hash_sha512",impl,mlen);
+      mlen += 1+mlen/2;
+    }
+  }
+}
+
+static void measure_pow_inv25519(void)
+{
+  if (targeto && strcmp(targeto,"pow")) return;
+  if (targetp && strcmp(targetp,"inv25519")) return;
+  unsigned char *n = alignedcalloc(lib25519_pow_inv25519_BYTES);
+  unsigned char *ne = alignedcalloc(lib25519_pow_inv25519_BYTES);
+
+  for (long long impl = -1;impl < lib25519_numimpl_pow_inv25519();++impl) {
+    void (*crypto_pow)(unsigned char *,const unsigned char *);
+    if (targeti && strcmp(targeti,lib25519_dispatch_pow_inv25519_implementation(impl))) continue;
+    if (impl >= 0) {
+      crypto_pow = lib25519_dispatch_pow_inv25519(impl);
+      printf("pow_inv25519 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_pow_inv25519_implementation(impl),lib25519_dispatch_pow_inv25519_compiler(impl));
+    } else {
+      crypto_pow = lib25519_pow_inv25519;
+      printf("pow_inv25519 selected implementation %s compiler %s\n",lib25519_pow_inv25519_implementation(),lib25519_pow_inv25519_compiler());
+    }
+    randombytes(n,lib25519_pow_inv25519_BYTES);
+    randombytes(ne,lib25519_pow_inv25519_BYTES);
+    for (long long i = 0;i <= TIMINGS;++i) {
+      t[i] = cpucycles();
+      crypto_pow(ne,n);
+    }
+    t_print("pow_inv25519",impl,lib25519_pow_inv25519_BYTES);
+  }
+}
+
+static void measure_nP_montgomery25519(void)
+{
+  if (targeto && strcmp(targeto,"nP")) return;
+  if (targetp && strcmp(targetp,"montgomery25519")) return;
+  unsigned char *n = alignedcalloc(lib25519_nP_montgomery25519_SCALARBYTES);
+  unsigned char *P = alignedcalloc(lib25519_nP_montgomery25519_POINTBYTES);
+  unsigned char *nP = alignedcalloc(lib25519_nP_montgomery25519_POINTBYTES);
+
+  for (long long impl = -1;impl < lib25519_numimpl_nP_montgomery25519();++impl) {
+    void (*crypto_nP)(unsigned char *,const unsigned char *,const unsigned char *);
+    if (targeti && strcmp(targeti,lib25519_dispatch_nP_montgomery25519_implementation(impl))) continue;
+    if (impl >= 0) {
+      crypto_nP = lib25519_dispatch_nP_montgomery25519(impl);
+      printf("nP_montgomery25519 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_nP_montgomery25519_implementation(impl),lib25519_dispatch_nP_montgomery25519_compiler(impl));
+    } else {
+      crypto_nP = lib25519_nP_montgomery25519;
+      printf("nP_montgomery25519 selected implementation %s compiler %s\n",lib25519_nP_montgomery25519_implementation(),lib25519_nP_montgomery25519_compiler());
+    }
+    randombytes(n,lib25519_nP_montgomery25519_SCALARBYTES);
+    randombytes(P,lib25519_nP_montgomery25519_POINTBYTES);
+    randombytes(nP,lib25519_nP_montgomery25519_POINTBYTES);
+    for (long long i = 0;i <= TIMINGS;++i) {
+      t[i] = cpucycles();
+      crypto_nP(nP,n,P);
+    }
+    t_print("nP_montgomery25519",impl,lib25519_nP_montgomery25519_POINTBYTES);
+  }
+}
+
+static void measure_nG_merged25519(void)
+{
+  if (targeto && strcmp(targeto,"nG")) return;
+  if (targetp && strcmp(targetp,"merged25519")) return;
+  unsigned char *n = alignedcalloc(lib25519_nP_SCALARBYTES);
+  unsigned char *nG = alignedcalloc(lib25519_nP_POINTBYTES);
+
+  for (long long impl = -1;impl < lib25519_numimpl_nG_merged25519();++impl) {
+    void (*crypto_nG)(unsigned char *,const unsigned char *);
+    if (targeti && strcmp(targeti,lib25519_dispatch_nG_merged25519_implementation(impl))) continue;
+    if (impl >= 0) {
+      crypto_nG = lib25519_dispatch_nG_merged25519(impl);
+      printf("nG_merged25519 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_nG_merged25519_implementation(impl),lib25519_dispatch_nG_merged25519_compiler(impl));
+    } else {
+      crypto_nG = lib25519_nG_merged25519;
+      printf("nG_merged25519 selected implementation %s compiler %s\n",lib25519_nG_merged25519_implementation(),lib25519_nG_merged25519_compiler());
+    }
+    randombytes(n,lib25519_nP_SCALARBYTES);
+    randombytes(nG,lib25519_nP_POINTBYTES);
+    for (long long i = 0;i <= TIMINGS;++i) {
+      t[i] = cpucycles();
+      crypto_nG(nG,n);
+    }
+    t_print("nG_merged25519",impl,lib25519_nP_POINTBYTES);
+  }
+}
+
+static void measure_nG_montgomery25519(void)
+{
+  if (targeto && strcmp(targeto,"nG")) return;
+  if (targetp && strcmp(targetp,"montgomery25519")) return;
+  unsigned char *n = alignedcalloc(lib25519_nP_SCALARBYTES);
+  unsigned char *nG = alignedcalloc(lib25519_nP_POINTBYTES);
+
+  for (long long impl = -1;impl < lib25519_numimpl_nG_montgomery25519();++impl) {
+    void (*crypto_nG)(unsigned char *,const unsigned char *);
+    if (targeti && strcmp(targeti,lib25519_dispatch_nG_montgomery25519_implementation(impl))) continue;
+    if (impl >= 0) {
+      crypto_nG = lib25519_dispatch_nG_montgomery25519(impl);
+      printf("nG_montgomery25519 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_nG_montgomery25519_implementation(impl),lib25519_dispatch_nG_montgomery25519_compiler(impl));
+    } else {
+      crypto_nG = lib25519_nG_montgomery25519;
+      printf("nG_montgomery25519 selected implementation %s compiler %s\n",lib25519_nG_montgomery25519_implementation(),lib25519_nG_montgomery25519_compiler());
+    }
+    randombytes(n,lib25519_nP_SCALARBYTES);
+    randombytes(nG,lib25519_nP_POINTBYTES);
+    for (long long i = 0;i <= TIMINGS;++i) {
+      t[i] = cpucycles();
+      crypto_nG(nG,n);
+    }
+    t_print("nG_montgomery25519",impl,lib25519_nP_POINTBYTES);
+  }
+}
+
+static void measure_mGnP_ed25519(void)
+{
+  if (targeto && strcmp(targeto,"mGnP")) return;
+  if (targetp && strcmp(targetp,"ed25519")) return;
+  unsigned char *mGnP = alignedcalloc(lib25519_mGnP_ed25519_OUTPUTBYTES);
+  unsigned char *m = alignedcalloc(lib25519_mGnP_ed25519_MBYTES);
+  unsigned char *n = alignedcalloc(lib25519_mGnP_ed25519_NBYTES);
+  unsigned char *P = alignedcalloc(lib25519_mGnP_ed25519_PBYTES);
+
+  for (long long impl = -1;impl < lib25519_numimpl_mGnP_ed25519();++impl) {
+    void (*crypto_mGnP)(unsigned char *,const unsigned char *,const unsigned char *,const unsigned char *);
+    if (targeti && strcmp(targeti,lib25519_dispatch_mGnP_ed25519_implementation(impl))) continue;
+    if (impl >= 0) {
+      crypto_mGnP = lib25519_dispatch_mGnP_ed25519(impl);
+      printf("mGnP_ed25519 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_mGnP_ed25519_implementation(impl),lib25519_dispatch_mGnP_ed25519_compiler(impl));
+    } else {
+      crypto_mGnP = lib25519_mGnP_ed25519;
+      printf("mGnP_ed25519 selected implementation %s compiler %s\n",lib25519_mGnP_ed25519_implementation(),lib25519_mGnP_ed25519_compiler());
+    }
+    randombytes(mGnP,lib25519_mGnP_ed25519_OUTPUTBYTES);
+    randombytes(m,lib25519_mGnP_ed25519_MBYTES);
+    randombytes(n,lib25519_mGnP_ed25519_NBYTES);
+    randombytes(P,lib25519_mGnP_ed25519_PBYTES);
+    for (long long i = 0;i <= TIMINGS;++i) {
+      t[i] = cpucycles();
+      crypto_mGnP(mGnP,m,n,P);
+    }
+    t_print("mGnP_ed25519",impl,lib25519_mGnP_ed25519_OUTPUTBYTES);
+  }
+}
+
+static void measure_dh_x25519(void)
+{
+  if (targeto && strcmp(targeto,"dh")) return;
+  if (targetp && strcmp(targetp,"x25519")) return;
+  unsigned char *pka = alignedcalloc(lib25519_dh_x25519_PUBLICKEYBYTES);
+  unsigned char *ska = alignedcalloc(lib25519_dh_x25519_SECRETKEYBYTES);
+  unsigned char *pkb = alignedcalloc(lib25519_dh_x25519_PUBLICKEYBYTES);
+  unsigned char *skb = alignedcalloc(lib25519_dh_x25519_SECRETKEYBYTES);
+  unsigned char *ka = alignedcalloc(lib25519_dh_x25519_BYTES);
+
+  for (long long impl = -1;impl < lib25519_numimpl_dh_x25519();++impl) {
+    void (*crypto_dh_keypair)(unsigned char *,unsigned char *);
+    void (*crypto_dh)(unsigned char *,const unsigned char *,const unsigned char *);
+    if (targeti && strcmp(targeti,lib25519_dispatch_dh_x25519_implementation(impl))) continue;
+    if (impl >= 0) {
+      crypto_dh_keypair = lib25519_dispatch_dh_x25519_keypair(impl);
+      crypto_dh = lib25519_dispatch_dh_x25519(impl);
+      printf("dh_x25519 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_dh_x25519_implementation(impl),lib25519_dispatch_dh_x25519_compiler(impl));
+    } else {
+      crypto_dh_keypair = lib25519_dh_x25519_keypair;
+      crypto_dh = lib25519_dh_x25519;
+      printf("dh_x25519 selected implementation %s compiler %s\n",lib25519_dh_x25519_implementation(),lib25519_dh_x25519_compiler());
+    }
+    randombytes(pka,lib25519_dh_x25519_PUBLICKEYBYTES);
+    randombytes(ska,lib25519_dh_x25519_SECRETKEYBYTES);
+    randombytes(pkb,lib25519_dh_x25519_PUBLICKEYBYTES);
+    randombytes(skb,lib25519_dh_x25519_SECRETKEYBYTES);
+    randombytes(ka,lib25519_dh_x25519_BYTES);
+    for (long long i = 0;i <= TIMINGS;++i) {
+      t[i] = cpucycles();
+      crypto_dh_keypair(pka,ska);
+    }
+    t_print("dh_x25519_keypair",impl,lib25519_dh_x25519_PUBLICKEYBYTES);
+    crypto_dh_keypair(pkb,skb);
+    for (long long i = 0;i <= TIMINGS;++i) {
+      t[i] = cpucycles();
+      crypto_dh(ka,pkb,ska);
+    }
+    t_print("dh_x25519",impl,lib25519_dh_x25519_BYTES);
+  }
+}
+
+static void measure_sign_ed25519(void)
+{
+  if (targeto && strcmp(targeto,"sign")) return;
+  if (targetp && strcmp(targetp,"ed25519")) return;
+  unsigned char *pk = alignedcalloc(lib25519_sign_ed25519_PUBLICKEYBYTES);
+  unsigned char *sk = alignedcalloc(lib25519_sign_ed25519_SECRETKEYBYTES);
+  unsigned char *m = alignedcalloc(MAXTEST_BYTES+lib25519_sign_ed25519_BYTES);
+  unsigned char *sm = alignedcalloc(MAXTEST_BYTES+lib25519_sign_ed25519_BYTES);
+  unsigned char *m2 = alignedcalloc(MAXTEST_BYTES+lib25519_sign_ed25519_BYTES);
+  long long mlen;
+  long long smlen;
+  long long m2len;
+
+  for (long long impl = -1;impl < lib25519_numimpl_sign_ed25519();++impl) {
+    void (*crypto_sign_keypair)(unsigned char *,unsigned char *);
+    void (*crypto_sign)(unsigned char *,long long *,const unsigned char *,long long,const unsigned char *);
+    int (*crypto_sign_open)(unsigned char *,long long *,const unsigned char *,long long,const unsigned char *);
+    if (targeti && strcmp(targeti,lib25519_dispatch_sign_ed25519_implementation(impl))) continue;
+    if (impl >= 0) {
+      crypto_sign_keypair = lib25519_dispatch_sign_ed25519_keypair(impl);
+      crypto_sign = lib25519_dispatch_sign_ed25519(impl);
+      crypto_sign_open = lib25519_dispatch_sign_ed25519_open(impl);
+      printf("sign_ed25519 %lld implementation %s compiler %s\n",impl,lib25519_dispatch_sign_ed25519_implementation(impl),lib25519_dispatch_sign_ed25519_compiler(impl));
+    } else {
+      crypto_sign_keypair = lib25519_sign_ed25519_keypair;
+      crypto_sign = lib25519_sign_ed25519;
+      crypto_sign_open = lib25519_sign_ed25519_open;
+      printf("sign_ed25519 selected implementation %s compiler %s\n",lib25519_sign_ed25519_implementation(),lib25519_sign_ed25519_compiler());
+    }
+    randombytes(pk,lib25519_sign_ed25519_PUBLICKEYBYTES);
+    randombytes(sk,lib25519_sign_ed25519_SECRETKEYBYTES);
+    randombytes(m,MAXTEST_BYTES+lib25519_sign_ed25519_BYTES);
+    randombytes(sm,MAXTEST_BYTES+lib25519_sign_ed25519_BYTES);
+    randombytes(m2,MAXTEST_BYTES+lib25519_sign_ed25519_BYTES);
+    for (long long i = 0;i <= TIMINGS;++i) {
+      t[i] = cpucycles();
+      crypto_sign_keypair(pk,sk);
+    }
+    t_print("sign_ed25519_keypair",impl,lib25519_sign_ed25519_PUBLICKEYBYTES);
+    mlen = 0;
+    while (mlen <= MAXTEST_BYTES) {
+      randombytes(m,mlen);
+      for (long long i = 0;i <= TIMINGS;++i) {
+        t[i] = cpucycles();
+        crypto_sign(sm,&smlen,m,mlen,sk);
+      }
+      t_print("sign_ed25519",impl,mlen);
+      mlen += 1+mlen/4;
+    }
+    mlen = 0;
+    while (mlen <= MAXTEST_BYTES) {
+      randombytes(m,mlen);
+      lib25519_sign(sm,&smlen,m,mlen,sk);
+      for (long long i = 0;i <= TIMINGS;++i) {
+        t[i] = cpucycles();
+        crypto_sign_open(m2,&m2len,sm,smlen,pk);
+      }
+      t_print("sign_ed25519_open",impl,mlen);
+      /* this is, in principle, not a test program */
+      /* but some checks here help validate the data flow above */
+      assert(m2len == mlen);
+      assert(!memcmp(m,m2,mlen));
+      mlen += 1+mlen/4;
+    }
+  }
+}
+
+#include "print_cpuid.inc"
+
+int main(int argc,char **argv)
+{
+  printf("lib25519 version %s\n",lib25519_version);
+  printf("lib25519 arch %s\n",lib25519_arch);
+  print_cpuid();
+
+  if (*argv) ++argv;
+  if (*argv) {
+    targeto = *argv++;
+    if (*argv) {
+      targetp = *argv++;
+      if (*argv) {
+        targeti = *argv++;
+      }
+    }
+  }
+
+  measure_cpucycles();
+  measure_randombytes();
+  limits();
+  measure_verify_32();
+  measure_hashblocks_sha512();
+  measure_hash_sha512();
+  measure_pow_inv25519();
+  measure_nP_montgomery25519();
+  measure_nG_merged25519();
+  measure_nG_montgomery25519();
+  measure_mGnP_ed25519();
+  measure_dh_x25519();
+  measure_sign_ed25519();
+
+  return 0;
+}
@@ -0,0 +1,17 @@
+static void limits()
+{
+#ifdef RLIM_INFINITY
+  struct rlimit r;
+  r.rlim_cur = 0;
+  r.rlim_max = 0;
+#ifdef RLIMIT_NOFILE
+  setrlimit(RLIMIT_NOFILE,&r);
+#endif
+#ifdef RLIMIT_NPROC
+  setrlimit(RLIMIT_NPROC,&r);
+#endif
+#ifdef RLIMIT_CORE
+  setrlimit(RLIMIT_CORE,&r);
+#endif
+#endif
+}
@@ -0,0 +1,9 @@
+static void print_cpuid(void)
+{
+  unsigned int cpuid[32];
+  lib25519_cpuid(cpuid,32);
+  printf("cpuid");
+  for (long long j = 0;j < 32;++j)
+    printf(" %08x",cpuid[j]);
+  printf("\n");
+}
@@ -0,0 +1,2 @@
+gcc -Wall -fPIC -fwrapv -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mtune=sandybridge
+clang -Wall -fPIC -fwrapv -Qunused-arguments -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mtune=sandybridge
@@ -0,0 +1,2 @@
+gcc -Wall -fPIC -fwrapv -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -mtune=haswell
+clang -Wall -fPIC -fwrapv -Qunused-arguments -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -mtune=haswell
@@ -0,0 +1,2 @@
+gcc -Wall -fPIC -fwrapv -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -madx -mtune=skylake
+clang -Wall -fPIC -fwrapv -Qunused-arguments -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -madx -mtune=skylake
@@ -0,0 +1,2 @@
+gcc -Wall -fPIC -fwrapv -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -madx -mavx512f -mavx512vl -mtune=skylake
+clang -Wall -fPIC -fwrapv -Qunused-arguments -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -madx -mavx512f -mavx512vl -mtune=skylake
@@ -0,0 +1,2 @@
+gcc -Wall -fPIC -fwrapv -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -madx -mavx512f -mavx512vl -mavx512ifma -mtune=skylake
+clang -Wall -fPIC -fwrapv -Qunused-arguments -O2 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mbmi -mbmi2 -mavx2 -madx -mavx512f -mavx512vl -mavx512ifma -mtune=skylake
@@ -0,0 +1,37 @@
+#define CPUID(func,leaf,a,b,c,d) \
+  __asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"a"(func),"c"(leaf):)
+
+#define WANT_1_3 ((1<<23)|(1<<25)|(1<<26))
+/* 23=mmx; 25=sse; 26=sse2 */
+
+#define WANT_1_2 ((1<<0)|(1<<9)|(1<<19)|(1<<20)|(1<<27)|(1<<28))
+/* 0=sse3; 9=ssse3; 19=sse41; 20=sse42; 27=osxsave; 28=avx */
+
+#define WANT_7_1 ((1<<3)|(1<<5)|(1<<8)|(1<<16)|(1<<19)|(1<<21)|(1<<31))
+/* 3=bmi1; 5=avx2; 8=bmi2; 16=avx512_f; 19=adx; 21=avx512_ifma; 31=avx512_vl */
+
+#define WANT_XCR ((1<<1)|(1<<2))
+/* 1=xmm; 2=ymm */
+
+int supports(void)
+{
+  unsigned int cpuidmax,id0,id1,id2;
+  unsigned int familymodelstepping;
+  unsigned int feature0,feature1,feature2,feature3;
+  unsigned int xcrlow,xcrhigh;
+
+  CPUID(0,0,cpuidmax,id0,id1,id2);
+  if (cpuidmax < 7) return 0;
+
+  CPUID(1,0,familymodelstepping,feature1,feature2,feature3);
+  if (WANT_1_2 != (WANT_1_2 & feature2)) return 0;
+  if (WANT_1_3 != (WANT_1_3 & feature3)) return 0;
+
+  CPUID(7,0,feature0,feature1,feature2,feature3);
+  if (WANT_7_1 != (WANT_7_1 & feature1)) return 0;
+
+  asm(".byte 15;.byte 1;.byte 208":"=a"(xcrlow),"=d"(xcrhigh):"c"(0));
+  if (WANT_XCR != (WANT_XCR & xcrlow)) return 0;
+
+  return 1;
+}
@@ -0,0 +1,37 @@
+#define CPUID(func,leaf,a,b,c,d) \
+  __asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"a"(func),"c"(leaf):)
+
+#define WANT_1_3 ((1<<23)|(1<<25)|(1<<26))
+/* 23=mmx; 25=sse; 26=sse2 */
+
+#define WANT_1_2 ((1<<0)|(1<<9)|(1<<19)|(1<<20)|(1<<27)|(1<<28))
+/* 0=sse3; 9=ssse3; 19=sse41; 20=sse42; 27=osxsave; 28=avx */
+
+#define WANT_7_1 ((1<<3)|(1<<5)|(1<<8)|(1<<16)|(1<<19)|(1<<31))
+/* 3=bmi1; 5=avx2; 8=bmi2; 16=avx512_f; 19=adx; 31=avx512_vl */
+
+#define WANT_XCR ((1<<1)|(1<<2))
+/* 1=xmm; 2=ymm */
+
+int supports(void)
+{
+  unsigned int cpuidmax,id0,id1,id2;
+  unsigned int familymodelstepping;
+  unsigned int feature0,feature1,feature2,feature3;
+  unsigned int xcrlow,xcrhigh;
+
+  CPUID(0,0,cpuidmax,id0,id1,id2);
+  if (cpuidmax < 7) return 0;
+
+  CPUID(1,0,familymodelstepping,feature1,feature2,feature3);
+  if (WANT_1_2 != (WANT_1_2 & feature2)) return 0;
+  if (WANT_1_3 != (WANT_1_3 & feature3)) return 0;
+
+  CPUID(7,0,feature0,feature1,feature2,feature3);
+  if (WANT_7_1 != (WANT_7_1 & feature1)) return 0;
+
+  asm(".byte 15;.byte 1;.byte 208":"=a"(xcrlow),"=d"(xcrhigh):"c"(0));
+  if (WANT_XCR != (WANT_XCR & xcrlow)) return 0;
+
+  return 1;
+}
@@ -0,0 +1,37 @@
+#define CPUID(func,leaf,a,b,c,d) \
+  __asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"a"(func),"c"(leaf):)
+
+#define WANT_1_3 ((1<<23)|(1<<25)|(1<<26))
+/* 23=mmx; 25=sse; 26=sse2 */
+
+#define WANT_1_2 ((1<<0)|(1<<9)|(1<<19)|(1<<20)|(1<<27)|(1<<28))
+/* 0=sse3; 9=ssse3; 19=sse41; 20=sse42; 27=osxsave; 28=avx */
+
+#define WANT_7_1 ((1<<3)|(1<<5)|(1<<8)|(1<<19))
+/* 3=bmi1; 5=avx2; 8=bmi2; 19=adx */
+
+#define WANT_XCR ((1<<1)|(1<<2))
+/* 1=xmm; 2=ymm */
+
+int supports(void)
+{
+  unsigned int cpuidmax,id0,id1,id2;
+  unsigned int familymodelstepping;
+  unsigned int feature0,feature1,feature2,feature3;
+  unsigned int xcrlow,xcrhigh;
+
+  CPUID(0,0,cpuidmax,id0,id1,id2);
+  if (cpuidmax < 7) return 0;
+
+  CPUID(1,0,familymodelstepping,feature1,feature2,feature3);
+  if (WANT_1_2 != (WANT_1_2 & feature2)) return 0;
+  if (WANT_1_3 != (WANT_1_3 & feature3)) return 0;
+
+  CPUID(7,0,feature0,feature1,feature2,feature3);
+  if (WANT_7_1 != (WANT_7_1 & feature1)) return 0;
+
+  asm(".byte 15;.byte 1;.byte 208":"=a"(xcrlow),"=d"(xcrhigh):"c"(0));
+  if (WANT_XCR != (WANT_XCR & xcrlow)) return 0;
+
+  return 1;
+}
@@ -0,0 +1,53 @@
+/*
+gcc has __builtin_cpu_supports("avx2")
+but implemented it incorrectly until 2018:
+https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85100
+
+as of 2022, many machines still have buggy versions of gcc
+
+furthermore, why is checking just for avx2 enough?
+has intel guaranteed that it will never introduce
+a cpu with avx2 instructions and without (e.g.) sse4.2?
+
+so manually check cpuid and xgetbv here
+and include all the "lower" instruction sets
+rather than trying to guess which ones are implied
+*/
+
+#define CPUID(func,leaf,a,b,c,d) \
+  __asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"a"(func),"c"(leaf):)
+
+#define WANT_1_3 ((1<<23)|(1<<25)|(1<<26))
+/* 23=mmx; 25=sse; 26=sse2 */
+
+#define WANT_1_2 ((1<<0)|(1<<9)|(1<<19)|(1<<20)|(1<<27)|(1<<28))
+/* 0=sse3; 9=ssse3; 19=sse41; 20=sse42; 27=osxsave; 28=avx */
+
+#define WANT_7_1 ((1<<3)|(1<<5)|(1<<8))
+/* 3=bmi1; 5=avx2; 8=bmi2 */
+
+#define WANT_XCR ((1<<1)|(1<<2))
+/* 1=xmm; 2=ymm */
+
+int supports(void)
+{
+  unsigned int cpuidmax,id0,id1,id2;
+  unsigned int familymodelstepping;
+  unsigned int feature0,feature1,feature2,feature3;
+  unsigned int xcrlow,xcrhigh;
+
+  CPUID(0,0,cpuidmax,id0,id1,id2);
+  if (cpuidmax < 7) return 0;
+
+  CPUID(1,0,familymodelstepping,feature1,feature2,feature3);
+  if (WANT_1_2 != (WANT_1_2 & feature2)) return 0;
+  if (WANT_1_3 != (WANT_1_3 & feature3)) return 0;
+
+  CPUID(7,0,feature0,feature1,feature2,feature3);
+  if (WANT_7_1 != (WANT_7_1 & feature1)) return 0;
+
+  asm(".byte 15;.byte 1;.byte 208":"=a"(xcrlow),"=d"(xcrhigh):"c"(0));
+  if (WANT_XCR != (WANT_XCR & xcrlow)) return 0;
+
+  return 1;
+}
@@ -0,0 +1,31 @@
+#define CPUID(func,leaf,a,b,c,d) \
+  __asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"a"(func),"c"(leaf):)
+
+#define WANT_1_3 ((1<<23)|(1<<25)|(1<<26))
+/* 23=mmx; 25=sse; 26=sse2 */
+
+#define WANT_1_2 ((1<<0)|(1<<9)|(1<<19)|(1<<20)|(1<<27)|(1<<28))
+/* 0=sse3; 9=ssse3; 19=sse41; 20=sse42; 27=osxsave; 28=avx */
+
+#define WANT_XCR ((1<<1)|(1<<2))
+/* 1=xmm; 2=ymm */
+
+int supports(void)
+{
+  unsigned int cpuidmax,id0,id1,id2;
+  unsigned int familymodelstepping;
+  unsigned int feature1,feature2,feature3;
+  unsigned int xcrlow,xcrhigh;
+
+  CPUID(0,0,cpuidmax,id0,id1,id2);
+  if (cpuidmax < 1) return 0;
+
+  CPUID(1,0,familymodelstepping,feature1,feature2,feature3);
+  if (WANT_1_2 != (WANT_1_2 & feature2)) return 0;
+  if (WANT_1_3 != (WANT_1_3 & feature3)) return 0;
+
+  asm(".byte 15;.byte 1;.byte 208":"=a"(xcrlow),"=d"(xcrhigh):"c"(0));
+  if (WANT_XCR != (WANT_XCR & xcrlow)) return 0;
+
+  return 1;
+}
@@ -0,0 +1,2 @@
+gcc -Wall -fPIC -fwrapv -O2
+clang -Wall -fPIC -fwrapv -Qunused-arguments -O2
@@ -0,0 +1,46 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <linux/perf_event.h>
+
+static struct perf_event_attr attr;
+static int fdperf = -1;
+static struct perf_event_mmap_page *buf = 0;
+
+long long ticks_setup(void)
+{
+  if (fdperf == -1) {
+    attr.type = PERF_TYPE_HARDWARE;
+    attr.config = PERF_COUNT_HW_CPU_CYCLES;
+    attr.exclude_kernel = 1;
+    fdperf = syscall(__NR_perf_event_open,&attr,0,-1,-1,0);
+    if (fdperf == -1) return 0;
+    buf = mmap(NULL,sysconf(_SC_PAGESIZE),PROT_READ,MAP_SHARED,fdperf,0);
+  }
+  return -1;
+}
+
+long long ticks(void)
+{
+  long long result;
+  unsigned int seq;
+  long long index;
+  long long offset;
+
+  do {
+    seq = buf->lock;
+    asm volatile("" ::: "memory");
+    index = buf->index;
+    offset = buf->offset;
+    asm volatile("rdpmc;shlq $32,%%rdx;orq %%rdx,%%rax"
+      : "=a"(result) : "c"(index-1) : "%rdx");
+    asm volatile("" ::: "memory");
+  } while (buf->lock != seq);
+
+  result += offset;
+  result &= 0xffffffffffff;
+  return result;
+}
@@ -0,0 +1,12 @@
+long long ticks_setup(void)
+{
+  return -2;
+}
+
+long long ticks(void)
+{
+  unsigned long long result;
+  asm volatile(".byte 15;.byte 49;shlq $32,%%rdx;orq %%rdx,%%rax"
+    : "=a"(result) :: "%rdx");
+  return result;
+}
@@ -0,0 +1,9 @@
+#ifndef cpucycles_h
+#define cpucycles_h
+
+extern long long (*cpucycles)(void) __attribute__((visibility("default")));;
+extern long long cpucycles_init(void) __attribute__((visibility("default")));;
+extern const char *cpucycles_implementation(void) __attribute__((visibility("default")));;
+extern long long cpucycles_persecond(void) __attribute__((visibility("default")));;
+
+#endif
@@ -0,0 +1,19 @@
+#include <time.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+long long ticks_setup(void)
+{
+  return 1000000;
+}
+
+long long ticks(void)
+{
+  struct timeval t;
+  long long result;
+  gettimeofday(&t,(struct timezone *) 0);
+  result = t.tv_sec;
+  result *= 1000000;
+  result += t.tv_usec;
+  return result;
+}
@@ -0,0 +1,18 @@
+#include <time.h>
+#include <sys/time.h>
+
+long long ticks_setup(void)
+{
+  return 1000000000;
+}
+
+long long ticks(void)
+{
+  struct timespec t;
+  long long result;
+  clock_gettime(CLOCK_MONOTONIC,&t);
+  result = t.tv_sec;
+  result *= 1000000000;
+  result += t.tv_nsec;
+  return result;
+}
@@ -0,0 +1,9 @@
+long long ticks_setup(void)
+{
+  return 0;
+}
+
+long long ticks(void)
+{
+  return 0;
+}
@@ -0,0 +1,4 @@
+amd64-pmc
+amd64-tsc
+default-monotonic
+default-gettimeofday
@@ -0,0 +1,216 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "cpucycles.h"
+
+static double osfreq(void)
+{
+  FILE *f;
+  char *x;
+  double result;
+  int s;
+
+  f = fopen("/etc/cpucyclespersecond", "r");
+  if (f) {
+    s = fscanf(f,"%lf",&result);
+    fclose(f);
+    if (s > 0) return result;
+  }
+
+  f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed", "r");
+  if (f) {
+    s = fscanf(f,"%lf",&result);
+    fclose(f);
+    if (s > 0) return 1000.0 * result;
+  }
+
+  f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq", "r");
+  if (f) {
+    s = fscanf(f,"%lf",&result);
+    fclose(f);
+    if (s > 0) return 1000.0 * result;
+  }
+
+  f = fopen("/sys/devices/system/cpu/cpu0/clock_tick", "r");
+  if (f) {
+    s = fscanf(f,"%lf",&result);
+    fclose(f);
+    if (s > 0) return result;
+  }
+
+  f = fopen("/proc/cpuinfo","r");
+  if (f) {
+    for (;;) {
+      s = fscanf(f,"cpu MHz : %lf",&result);
+      if (s > 0) break;
+      if (s == 0) s = fscanf(f,"%*[^\n]\n");
+      if (s < 0) { result = 0; break; }
+    }
+    fclose(f);
+    if (result) return 1000000.0 * result;
+  }
+
+  f = fopen("/proc/cpuinfo","r");
+  if (f) {
+    for (;;) {
+      s = fscanf(f,"clock : %lf",&result);
+      if (s > 0) break;
+      if (s == 0) s = fscanf(f,"%*[^\n]\n");
+      if (s < 0) { result = 0; break; }
+    }
+    fclose(f);
+    if (result) return 1000000.0 * result;
+  }
+
+  f = popen("sysctl hw.cpufrequency 2>/dev/null","r");
+  if (f) {
+    s = fscanf(f,"hw.cpufrequency: %lf",&result);
+    pclose(f);
+    if (s > 0) if (result > 0) return result;
+  }
+
+  f = popen("/usr/sbin/lsattr -E -l proc0 -a frequency 2>/dev/null","r");
+  if (f) {
+    s = fscanf(f,"frequency %lf",&result);
+    pclose(f);
+    if (s > 0) return result;
+  }
+
+  f = popen("/usr/sbin/psrinfo -v 2>/dev/null","r");
+  if (f) {
+    for (;;) {
+      s = fscanf(f," The %*s processor operates at %lf MHz",&result);
+      if (s > 0) break;
+      if (s == 0) s = fscanf(f,"%*[^\n]\n");
+      if (s < 0) { result = 0; break; }
+    }
+    pclose(f);
+    if (result) return 1000000.0 * result;
+  }
+
+  x = getenv("cpucyclespersecond");
+  if (x) {
+    s = sscanf(x,"%lf",&result);
+    if (s > 0) return result;
+  }
+
+  return 0;
+}
+
+static long long persecond = 0;
+const char *implementation = "none";
+
+long long (*cpucycles)(void) = cpucycles_init;
+
+const char *cpucycles_implementation(void)
+{
+  cpucycles();
+  return implementation;
+}
+
+long long cpucycles_persecond(void)
+{
+  cpucycles();
+  return persecond;
+}
+
+static double cpucycles_scaled_scaling = 0;
+static long long (*cpucycles_scaled_from)(void) = 0;
+
+static long long cpucycles_scaled(void)
+{
+  return cpucycles_scaled_from()*cpucycles_scaled_scaling;
+}
+
+#include "options.inc"
+
+#define CALLS 1000
+
+long long cpucycles_init(void)
+{
+  long long precision[NUMOPTIONS];
+  long long scaling[NUMOPTIONS];
+  long long bestprecision;
+  long long bestopt;
+
+  persecond = osfreq();
+
+  for (long long opt = 0;opt < NUMOPTIONS;++opt) {
+    long long freq = options[opt].ticks_setup();
+
+    // freq > 0: freq ticks per second
+    // freq == 0: do not use
+    // freq == -1: cycle counter (e.g., rdpmc)
+    // freq == -2: probably cycle counter (e.g., rdtsc)
+    // freq == -3: tick counter every N cycles for some unknown N
+
+    precision[opt] = 0;
+
+    if (freq > 0) { // means: freq ticks per second
+      scaling[opt] = persecond*1.0/freq;
+    } else if (freq == -1) { // means: cycle counter; e.g., rdpmc
+      scaling[opt] = 1.0;
+    } else if (freq == -2) { // means: probably cycle counter; e.g., rdtsc
+      scaling[opt] = 1.0;
+    } else {
+      continue;
+    }
+
+    for (long long tries = 0;tries < 10;++tries) {
+      long long t[CALLS+1];
+      long long ok = 1;
+
+      if (scaling[opt] == 1.0) {
+        for (long long i = 0;i <= CALLS;++i)
+          t[i] = options[opt].ticks();
+      } else {
+        double scalingopt = scaling[opt];
+        for (long long i = 0;i <= CALLS;++i)
+          t[i] = options[opt].ticks()*scalingopt;
+      }
+      for (long long i = 0;i < CALLS;++i)
+        if (t[i] > t[i+1])
+          ok = 0;
+      if (t[0] == t[CALLS])
+        ok = 0;
+
+      if (ok) {
+        long long smallestdiff = 0;
+        for (long long i = 0;i < CALLS;++i) {
+          long long diff = t[i+1]-t[i];
+          if (diff <= 0) continue;
+          if (smallestdiff == 0 || diff < smallestdiff)
+            smallestdiff = diff;
+        }
+        precision[opt] = smallestdiff;
+        if (freq != -1)
+          precision[opt] += 100;
+        break;
+      }
+
+      // otherwise keep trying
+      // since !ok can be caused by overflow
+      // or by core swap
+    }
+  }
+
+  bestopt = DEFAULTOPTION;
+  bestprecision = 0;
+  for (long long opt = 0;opt < NUMOPTIONS;++opt)
+    if (precision[opt] > 0)
+      if (!bestprecision || precision[opt] < bestprecision) {
+        bestopt = opt;
+        bestprecision = precision[opt];
+      }
+
+  implementation = options[bestopt].implementation;
+  
+  if (scaling[bestopt] == 1.0) {
+    cpucycles = options[bestopt].ticks;
+  } else {
+    cpucycles_scaled_scaling = scaling[bestopt];
+    cpucycles_scaled_from = options[bestopt].ticks;
+    cpucycles = cpucycles_scaled;
+  }
+
+  return cpucycles();
+}
@@ -0,0 +1,73 @@
+#include <stdio.h>
+
+#define CPUID(func,leaf,a,b,c,d) \
+  __asm("cpuid":"=a"(a),"=b"(b),"=c"(c),"=d"(d):"a"(func),"c"(leaf):)
+
+__attribute__((visibility("default")))
+void lib25519_cpuid(unsigned int *result,long long resultlen)
+{
+  unsigned int a,b,c,d;
+  unsigned int cpuidmax,extendedcpuidmax;
+  int havexgetbv = 0;
+
+  CPUID(0,0,a,b,c,d);
+  cpuidmax = a;
+  if (resultlen > 0) { *result++ = b; --resultlen; }
+  if (resultlen > 0) { *result++ = c; --resultlen; }
+  if (resultlen > 0) { *result++ = d; --resultlen; }
+
+  a = b = c = d = 0;
+  CPUID(0x80000000,0,a,b,c,d);
+  extendedcpuidmax = a;
+
+  a = b = c = d = 0;
+  if (extendedcpuidmax >= 0x80000002) CPUID(0x80000002,0,a,b,c,d);
+  if (resultlen > 0) { *result++ = a; --resultlen; }
+  if (resultlen > 0) { *result++ = b; --resultlen; }
+  if (resultlen > 0) { *result++ = c; --resultlen; }
+  if (resultlen > 0) { *result++ = d; --resultlen; }
+
+  a = b = c = d = 0;
+  if (extendedcpuidmax >= 0x80000003) CPUID(0x80000003,0,a,b,c,d);
+  if (resultlen > 0) { *result++ = a; --resultlen; }
+  if (resultlen > 0) { *result++ = b; --resultlen; }
+  if (resultlen > 0) { *result++ = c; --resultlen; }
+  if (resultlen > 0) { *result++ = d; --resultlen; }
+
+  a = b = c = d = 0;
+  if (extendedcpuidmax >= 0x80000004) CPUID(0x80000004,0,a,b,c,d);
+  if (resultlen > 0) { *result++ = a; --resultlen; }
+  if (resultlen > 0) { *result++ = b; --resultlen; }
+  if (resultlen > 0) { *result++ = c; --resultlen; }
+  if (resultlen > 0) { *result++ = d; --resultlen; }
+
+  a = b = c = d = 0;
+  if (cpuidmax >= 1) CPUID(1,0,a,b,c,d);
+  if (resultlen > 0) { *result++ = a; --resultlen; }
+  if (resultlen > 0) { *result++ = b; --resultlen; }
+  if (resultlen > 0) { *result++ = c; --resultlen; }
+  if (resultlen > 0) { *result++ = d; --resultlen; }
+  /* 27=osxsave; 28=avx */
+  if (((1<<27)|(1<<28)) == (((1<<27)|(1<<28)) & c))
+    havexgetbv = 1;
+
+  a = b = c = d = 0;
+  if (cpuidmax >= 7) CPUID(7,0,a,b,c,d);
+  if (resultlen > 0) { *result++ = a; --resultlen; }
+  if (resultlen > 0) { *result++ = b; --resultlen; }
+  if (resultlen > 0) { *result++ = c; --resultlen; }
+  if (resultlen > 0) { *result++ = d; --resultlen; }
+
+  a = b = c = d = 0;
+  if (extendedcpuidmax >= 0x80000001) CPUID(0x80000001,0,a,b,c,d);
+  if (resultlen > 0) { *result++ = a; --resultlen; }
+  if (resultlen > 0) { *result++ = b; --resultlen; }
+  if (resultlen > 0) { *result++ = c; --resultlen; }
+  if (resultlen > 0) { *result++ = d; --resultlen; }
+
+  a = b = c = d = 0;
+  if (havexgetbv) asm(".byte 15;.byte 1;.byte 208":"=a"(a),"=d"(d):"c"(0));
+  if (resultlen > 0) { *result++ = a; --resultlen; }
+
+  while (resultlen > 0) { *result++ = 0; --resultlen; }
+}
@@ -0,0 +1,7 @@
+#include <stdio.h>
+
+__attribute__((visibility("default")))
+void lib25519_cpuid(unsigned int *result,long long resultlen)
+{
+  while (resultlen > 0) { *result++ = 0; --resultlen; }
+}
@@ -0,0 +1,9 @@
+#include "crypto_nP_montgomery25519.h"
+#include "crypto_dh.h"
+
+void crypto_dh(unsigned char *abshared,
+                      const unsigned char *bobpk,
+                      const unsigned char *alicesk)
+{
+  crypto_nP_montgomery25519(abshared,alicesk,bobpk);
+}
@@ -0,0 +1,9 @@
+#include "crypto_nG_montgomery25519.h"
+#include "randombytes.h"
+#include "crypto_dh.h"
+
+void crypto_dh_keypair(unsigned char *pk,unsigned char *sk)
+{
+  randombytes(sk,crypto_dh_SECRETKEYBYTES);
+  crypto_nG_montgomery25519(pk,sk);
+}
@@ -0,0 +1 @@
+../ref/api.h
@@ -0,0 +1 @@
+../../../crypto_pow/inv25519/sandy2x/architectures
@@ -0,0 +1,78 @@
+#include <immintrin.h>
+#include "crypto_hashblocks_sha512.h"
+#include "crypto_hash.h"
+
+#define blocks crypto_hashblocks_sha512
+
+#define ALIGNED __attribute((aligned(32)))
+
+static const ALIGNED unsigned char iv[64] = {
+  0x6a,0x09,0xe6,0x67,0xf3,0xbc,0xc9,0x08,
+  0xbb,0x67,0xae,0x85,0x84,0xca,0xa7,0x3b,
+  0x3c,0x6e,0xf3,0x72,0xfe,0x94,0xf8,0x2b,
+  0xa5,0x4f,0xf5,0x3a,0x5f,0x1d,0x36,0xf1,
+  0x51,0x0e,0x52,0x7f,0xad,0xe6,0x82,0xd1,
+  0x9b,0x05,0x68,0x8c,0x2b,0x3e,0x6c,0x1f,
+  0x1f,0x83,0xd9,0xab,0xfb,0x41,0xbd,0x6b,
+  0x5b,0xe0,0xcd,0x19,0x13,0x7e,0x21,0x79
+} ;
+
+typedef unsigned long long uint64;
+
+#define load256(x) (_mm256_loadu_si256((void *) (x)))
+#define store256(x,y) (_mm256_storeu_si256((void *) (x),y))
+
+void crypto_hash(unsigned char *out,const unsigned char *in,long long inlen)
+{
+  ALIGNED unsigned char h[64];
+  ALIGNED unsigned char padded[256];
+  unsigned long long i;
+  unsigned long long bytes = inlen;
+  __m256i X0,X1;
+
+  X0 = load256(iv);
+  X1 = load256(iv + 32);
+
+  store256(h,X0);
+  store256(h + 32,X1);
+
+  blocks(h,in,inlen);
+  in += inlen;
+  inlen &= 127;
+  in -= inlen;
+
+  X0 ^= X0;
+
+  if (inlen < 112) {
+    store256(padded,X0);
+    store256(padded + 32,X0);
+    store256(padded + 64,X0);
+    store256(padded + 96,X0);
+
+    for (i = 0;i < inlen;++i) padded[i] = in[i];
+    padded[inlen] = 0x80;
+
+    padded[119] = bytes >> 61;
+    *(uint64 *) (padded + 120) = __builtin_bswap64(bytes << 3);
+    blocks(h,padded,128);
+  } else {
+    store256(padded + 96,X0);
+    store256(padded + 128,X0);
+    store256(padded + 160,X0);
+    store256(padded + 192,X0);
+    store256(padded + 224,X0);
+
+    for (i = 0;i < inlen;++i) padded[i] = in[i];
+    padded[inlen] = 0x80;
+
+    padded[247] = bytes >> 61;
+    *(uint64 *) (padded + 248) = __builtin_bswap64(bytes << 3);
+    blocks(h,padded,256);
+  }
+
+  X0 = load256(h);
+  X1 = load256(h + 32);
+
+  store256(out,X0);
+  store256(out + 32,X1);
+}
@@ -0,0 +1 @@
+../ref/implementors
@@ -0,0 +1 @@
+#define CRYPTO_BYTES 64
@@ -0,0 +1,69 @@
+/*
+20080913
+D. J. Bernstein
+Public domain.
+*/
+
+#include "crypto_hashblocks_sha512.h"
+#include "crypto_hash.h"
+
+#define blocks crypto_hashblocks_sha512
+
+static const unsigned char iv[64] = {
+  0x6a,0x09,0xe6,0x67,0xf3,0xbc,0xc9,0x08,
+  0xbb,0x67,0xae,0x85,0x84,0xca,0xa7,0x3b,
+  0x3c,0x6e,0xf3,0x72,0xfe,0x94,0xf8,0x2b,
+  0xa5,0x4f,0xf5,0x3a,0x5f,0x1d,0x36,0xf1,
+  0x51,0x0e,0x52,0x7f,0xad,0xe6,0x82,0xd1,
+  0x9b,0x05,0x68,0x8c,0x2b,0x3e,0x6c,0x1f,
+  0x1f,0x83,0xd9,0xab,0xfb,0x41,0xbd,0x6b,
+  0x5b,0xe0,0xcd,0x19,0x13,0x7e,0x21,0x79
+} ;
+
+typedef unsigned long long uint64;
+
+void crypto_hash(unsigned char *out,const unsigned char *in,long long inlen)
+{
+  unsigned char h[64];
+  unsigned char padded[256];
+  int i;
+  unsigned long long bytes = inlen;
+
+  for (i = 0;i < 64;++i) h[i] = iv[i];
+
+  blocks(h,in,inlen);
+  in += inlen;
+  inlen &= 127;
+  in -= inlen;
+
+  for (i = 0;i < inlen;++i) padded[i] = in[i];
+  padded[inlen] = 0x80;
+
+  if (inlen < 112) {
+    for (i = inlen + 1;i < 119;++i) padded[i] = 0;
+    padded[119] = bytes >> 61;
+    padded[120] = bytes >> 53;
+    padded[121] = bytes >> 45;
+    padded[122] = bytes >> 37;
+    padded[123] = bytes >> 29;
+    padded[124] = bytes >> 21;
+    padded[125] = bytes >> 13;
+    padded[126] = bytes >> 5;
+    padded[127] = bytes << 3;
+    blocks(h,padded,128);
+  } else {
+    for (i = inlen + 1;i < 247;++i) padded[i] = 0;
+    padded[247] = bytes >> 61;
+    padded[248] = bytes >> 53;
+    padded[249] = bytes >> 45;
+    padded[250] = bytes >> 37;
+    padded[251] = bytes >> 29;
+    padded[252] = bytes >> 21;
+    padded[253] = bytes >> 13;
+    padded[254] = bytes >> 5;
+    padded[255] = bytes << 3;
+    blocks(h,padded,256);
+  }
+
+  for (i = 0;i < 64;++i) out[i] = h[i];
+}
@@ -0,0 +1 @@
+Daniel J. Bernstein (wrapper around crypto_hashblocks/sha512)
@@ -0,0 +1 @@
+../m3/api.h
@@ -0,0 +1,2 @@
+amd64 avx2
+x86 avx2
@@ -0,0 +1 @@
+../m3/constants.c
@@ -0,0 +1 @@
+../m3/implementors
@@ -0,0 +1,249 @@
+#include <immintrin.h>
+#include "inner.h"
+
+#define uint64 crypto_uint64
+
+static uint64 load_bigendian(const unsigned char *x)
+{
+  return __builtin_bswap64(*(uint64 *) x);
+}
+
+static void store_bigendian(unsigned char *x,uint64 u)
+{
+  *(uint64 *) x = __builtin_bswap64(u);
+}
+
+#define SHR(x,c) ((x) >> (c))
+#define ROTR(x,c) (((x) >> (c)) | ((x) << (64 - (c))))
+#define sigma0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x,7))
+#define sigma1(x) (ROTR(x,19) ^ ROTR(x,61) ^ SHR(x,6))
+
+#define Ch(x,y,z) (z ^ (x & (y ^ z)))
+#define Maj(x,y,z) ((x & (y ^ z)) ^ (y & z))
+#define Sigma0(x) (ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39))
+#define Sigma1(x) (ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41))
+
+#define ALIGNED __attribute((aligned(32)))
+
+#define load64(x) (*(uint64 *) (x))
+
+#define store256(x,y) (*(volatile __m256i *) (x) = (y))
+
+#define bigendian64 _mm256_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)
+
+#define PREEXPANDx4(X0,X9,X1) \
+      X0 = _mm256_add_epi64(X0, \
+            _mm256_srli_epi64(X1,1) ^ _mm256_slli_epi64(X1,63) ^ \
+            _mm256_srli_epi64(X1,8) ^ _mm256_slli_epi64(X1,56) ^ \
+            _mm256_srli_epi64(X1,7) \
+            ); \
+      X0 = _mm256_add_epi64(X0,X9);
+
+#define POSTEXPANDx4(X0,W0,W2,W14) \
+      W0 = ( \
+        _mm256_extracti128_si256(X0,0)); \
+      W0 = _mm_add_epi64(W0, \
+        _mm_srli_epi64(W14,19) ^ _mm_slli_epi64(W14,45) ^ \
+        _mm_srli_epi64(W14,61) ^ _mm_slli_epi64(W14,3) ^ \
+        _mm_srli_epi64(W14,6)); \
+      W2 = ( \
+        _mm256_extracti128_si256(X0,1)); \
+      W2 = _mm_add_epi64(W2, \
+        _mm_srli_epi64(W0,19) ^ _mm_slli_epi64(W0,45) ^ \
+        _mm_srli_epi64(W0,61) ^ _mm_slli_epi64(W0,3) ^ \
+        _mm_srli_epi64(W0,6)); \
+      X0 = _mm256_insertf128_si256(_mm256_castsi128_si256(W0),W2,1);
+
+#define ROUND0(i,r0,r1,r2,r3,r4,r5,r6,r7) \
+      r7 += load64(&wc[i]); \
+      r7 += Ch(r4,r5,r6); \
+      r7 += Sigma1(r4); \
+      r3 += r7; \
+      r7 += Maj(r2,r0,r1); \
+      r7 += Sigma0(r0); \
+
+#define ROUND1(i,r0,r1,r2,r3,r4,r5,r6,r7) \
+      r7 += load64(&wc[i]); \
+      r7 += Ch(r4,r5,r6); \
+      r7 += Sigma1(r4); \
+      r3 += r7; \
+      r7 += Maj(r0,r1,r2); \
+      r7 += Sigma0(r0); \
+
+int inner(unsigned char *statebytes,const unsigned char *in,unsigned int inlen,const uint64 *constants)
+{
+  ALIGNED uint64 state[8];
+  ALIGNED uint64 w[20];
+  ALIGNED uint64 wc[16]; /* w[i]+constants[i] */
+  uint64 r0,r1,r2,r3,r4,r5,r6,r7;
+  __m128i W0,W2,W4,W6,W8,W10,W12,W14;
+  __m256i X0,X1,X4,X5,X8,X9,X12,X13;
+  __m256i D0,D4,D8,D12;
+  int i;
+
+  state[0] = r0 = load_bigendian(statebytes);
+  state[1] = r1 = load_bigendian(statebytes+8);
+  state[2] = r2 = load_bigendian(statebytes+16);
+  state[3] = r3 = load_bigendian(statebytes+24);
+  state[4] = r4 = load_bigendian(statebytes+32);
+  state[5] = r5 = load_bigendian(statebytes+40);
+  state[6] = r6 = load_bigendian(statebytes+48);
+  state[7] = r7 = load_bigendian(statebytes+56);
+
+  do {
+        X0 = _mm256_loadu_si256((void *) (in+0));
+        X0 = _mm256_shuffle_epi8(X0,bigendian64);
+        D0 = _mm256_loadu_si256((void *) &constants[0]);
+        D0 = _mm256_add_epi64(X0,D0);
+        store256(&wc[0],D0);
+        store256(&w[0],X0);
+        store256(&w[16],X0);
+
+        X4 = _mm256_loadu_si256((void *) (in+32));
+        X4 = _mm256_shuffle_epi8(X4,bigendian64);
+        D4 = _mm256_loadu_si256((void *) &constants[4]);
+        D4 = _mm256_add_epi64(X4,D4);
+        store256(&wc[4],D4);
+        store256(&w[4],X4);
+
+      ROUND0(0,r0,r1,r2,r3,r4,r5,r6,r7)
+      ROUND1(1,r7,r0,r1,r2,r3,r4,r5,r6)
+
+        X8 = _mm256_loadu_si256((void *) (in+64));
+        X8 = _mm256_shuffle_epi8(X8,bigendian64);
+        D8 = _mm256_loadu_si256((void *) &constants[8]);
+        D8 = _mm256_add_epi64(X8,D8);
+        store256(&wc[8],D8);
+        store256(&w[8],X8);
+
+      ROUND0(2,r6,r7,r0,r1,r2,r3,r4,r5)
+      ROUND1(3,r5,r6,r7,r0,r1,r2,r3,r4)
+
+      ROUND0(4,r4,r5,r6,r7,r0,r1,r2,r3)
+      ROUND1(5,r3,r4,r5,r6,r7,r0,r1,r2)
+
+        X12 = _mm256_loadu_si256((void *) (in+96));
+        X12 = _mm256_shuffle_epi8(X12,bigendian64);
+        D12 = _mm256_loadu_si256((void *) &constants[12]);
+        D12 = _mm256_add_epi64(X12,D12);
+        store256(&wc[12],D12);
+        store256(&w[12],X12);
+
+      ROUND0(6,r2,r3,r4,r5,r6,r7,r0,r1)
+      ROUND1(7,r1,r2,r3,r4,r5,r6,r7,r0)
+
+      ROUND0(8,r0,r1,r2,r3,r4,r5,r6,r7)
+      ROUND1(9,r7,r0,r1,r2,r3,r4,r5,r6)
+
+
+    for (i = 4;i > 0;--i) {
+
+          constants += 16;
+
+          X1 = _mm256_loadu_si256((void *) (w+1));
+          X9 = _mm256_loadu_si256((void *) (w+9));
+          PREEXPANDx4(X0,X9,X1)
+
+          W14 = _mm_loadu_si128((void *) (w+14));
+          POSTEXPANDx4(X0,W0,W2,W14)
+
+          D0 = _mm256_loadu_si256((void *) &constants[0]);
+          D0 = _mm256_add_epi64(X0,D0);
+          store256(&wc[0],D0);
+          store256(w+16,X0);
+          store256(w+0,X0);
+
+      ROUND0(10,r6,r7,r0,r1,r2,r3,r4,r5)
+      ROUND1(11,r5,r6,r7,r0,r1,r2,r3,r4)
+
+      ROUND0(12,r4,r5,r6,r7,r0,r1,r2,r3)
+      ROUND1(13,r3,r4,r5,r6,r7,r0,r1,r2)
+
+          X5 = _mm256_loadu_si256((void *) (w+5));
+          X13 = _mm256_loadu_si256((void *) (w+13));
+          PREEXPANDx4(X4,X13,X5)
+
+          W2 = _mm_loadu_si128((void *) (w+2));
+          POSTEXPANDx4(X4,W4,W6,W2)
+
+          D4 = _mm256_loadu_si256((void *) &constants[4]);
+          D4 = _mm256_add_epi64(X4,D4);
+          store256(&wc[4],D4);
+          store256(w+4,X4);
+
+      ROUND0(14,r2,r3,r4,r5,r6,r7,r0,r1)
+      ROUND1(15,r1,r2,r3,r4,r5,r6,r7,r0)
+
+      ROUND0(0,r0,r1,r2,r3,r4,r5,r6,r7)
+      ROUND1(1,r7,r0,r1,r2,r3,r4,r5,r6)
+
+          X9 = _mm256_loadu_si256((void *) (w+9));
+          X1 = _mm256_loadu_si256((void *) (w+1));
+          PREEXPANDx4(X8,X1,X9)
+
+          W6 = _mm_loadu_si128((void *) (w+6));
+          POSTEXPANDx4(X8,W8,W10,W6)
+
+          D8 = _mm256_loadu_si256((void *) &constants[8]);
+          D8 = _mm256_add_epi64(X8,D8);
+          store256(&wc[8],D8);
+          store256(w+8,X8);
+
+      ROUND0(2,r6,r7,r0,r1,r2,r3,r4,r5)
+      ROUND1(3,r5,r6,r7,r0,r1,r2,r3,r4)
+
+      ROUND0(4,r4,r5,r6,r7,r0,r1,r2,r3)
+      ROUND1(5,r3,r4,r5,r6,r7,r0,r1,r2)
+
+          X13 = _mm256_loadu_si256((void *) (w+13));
+          X5 = _mm256_loadu_si256((void *) (w+5));
+          PREEXPANDx4(X12,X5,X13)
+
+          W10 = _mm_loadu_si128((void *) (w+10));
+          POSTEXPANDx4(X12,W12,W14,W10)
+
+          D12 = _mm256_loadu_si256((void *) &constants[12]);
+          D12 = _mm256_add_epi64(X12,D12);
+          store256(&wc[12],D12);
+          store256(w+12,X12);
+
+      ROUND0(6,r2,r3,r4,r5,r6,r7,r0,r1)
+      ROUND1(7,r1,r2,r3,r4,r5,r6,r7,r0)
+
+      ROUND0(8,r0,r1,r2,r3,r4,r5,r6,r7)
+      ROUND1(9,r7,r0,r1,r2,r3,r4,r5,r6)
+
+    }
+
+    {
+      ROUND0(10,r6,r7,r0,r1,r2,r3,r4,r5)
+      ROUND1(11,r5,r6,r7,r0,r1,r2,r3,r4)
+
+      ROUND0(12,r4,r5,r6,r7,r0,r1,r2,r3)
+      ROUND1(13,r3,r4,r5,r6,r7,r0,r1,r2)
+
+      ROUND0(14,r2,r3,r4,r5,r6,r7,r0,r1)
+      ROUND1(15,r1,r2,r3,r4,r5,r6,r7,r0)
+    }
+
+
+    constants -= 64;
+
+    r0 += state[0]; state[0] = r0;
+    r1 += state[1]; state[1] = r1;
+    r2 += state[2]; state[2] = r2;
+    r3 += state[3]; state[3] = r3;
+    r4 += state[4]; state[4] = r4;
+    r5 += state[5]; state[5] = r5;
+    r6 += state[6]; state[6] = r6;
+    r7 += state[7]; state[7] = r7;
+
+    in += 128;
+    inlen -= 128;
+  } while (inlen >= 128);
+
+  for (i = 0;i < 8;++i)
+    store_bigendian(statebytes+8*i,state[i]);
+
+  return inlen;
+}
@@ -0,0 +1,10 @@
+#ifndef inner_h
+#define inner_h
+
+#define inner CRYPTO_NAMESPACE(inner)
+
+#include "crypto_uint64.h"
+
+extern int inner(unsigned char *,const unsigned char *,unsigned int,const crypto_uint64 *);
+
+#endif
@@ -0,0 +1,68 @@
+inner.S: \
+round01.q \
+round23.q \
+round45.q \
+round67.q \
+round89.q \
+round1011.q \
+round1213.q \
+round1415.q \
+expand0.q \
+expand4.q \
+expand8.q \
+expand12.q \
+rer0.q \
+rer4.q \
+rer8.q \
+rer12.q \
+inner.q \
+inner.S.do
+	./inner.S.do < inner.q > inner.S
+
+round01.q: round.py
+	./round.py 0 > round01.q
+
+round23.q: round.py
+	./round.py 1 > round23.q
+
+round45.q: round.py
+	./round.py 2 > round45.q
+
+round67.q: round.py
+	./round.py 3 > round67.q
+
+round89.q: round.py
+	./round.py 4 > round89.q
+
+round1011.q: round.py
+	./round.py 5 > round1011.q
+
+round1213.q: round.py
+	./round.py 6 > round1213.q
+
+round1415.q: round.py
+	./round.py 7 > round1415.q
+
+expand0.q: expand.py
+	./expand.py 0 > expand0.q
+
+expand4.q: expand.py
+	./expand.py 4 > expand4.q
+
+expand8.q: expand.py
+	./expand.py 8 > expand8.q
+
+expand12.q: expand.py
+	./expand.py 12 > expand12.q
+
+rer0.q: rer.py
+	./rer.py 0 > rer0.q
+
+rer4.q: rer.py
+	./rer.py 4 > rer4.q
+
+rer8.q: rer.py
+	./rer.py 8 > rer8.q
+
+rer12.q: rer.py
+	./rer.py 12 > rer12.q
@@ -0,0 +1 @@
+../m3/api.h
@@ -0,0 +1,2 @@
+amd64 bmi2 avx2
+x86 bmi2 avx2
@@ -0,0 +1,104 @@
+#include "crypto_hashblocks.h"
+#include "inner.h"
+
+static const crypto_uint64 constants[84] = {
+  0x428a2f98d728ae22ULL
+, 0x7137449123ef65cdULL
+, 0xb5c0fbcfec4d3b2fULL
+, 0xe9b5dba58189dbbcULL
+, 0x3956c25bf348b538ULL
+, 0x59f111f1b605d019ULL
+, 0x923f82a4af194f9bULL
+, 0xab1c5ed5da6d8118ULL
+, 0xd807aa98a3030242ULL
+, 0x12835b0145706fbeULL
+, 0x243185be4ee4b28cULL
+, 0x550c7dc3d5ffb4e2ULL
+, 0x72be5d74f27b896fULL
+, 0x80deb1fe3b1696b1ULL
+, 0x9bdc06a725c71235ULL
+, 0xc19bf174cf692694ULL
+, 0xe49b69c19ef14ad2ULL
+, 0xefbe4786384f25e3ULL
+, 0x0fc19dc68b8cd5b5ULL
+, 0x240ca1cc77ac9c65ULL
+, 0x2de92c6f592b0275ULL
+, 0x4a7484aa6ea6e483ULL
+, 0x5cb0a9dcbd41fbd4ULL
+, 0x76f988da831153b5ULL
+, 0x983e5152ee66dfabULL
+, 0xa831c66d2db43210ULL
+, 0xb00327c898fb213fULL
+, 0xbf597fc7beef0ee4ULL
+, 0xc6e00bf33da88fc2ULL
+, 0xd5a79147930aa725ULL
+, 0x06ca6351e003826fULL
+, 0x142929670a0e6e70ULL
+, 0x27b70a8546d22ffcULL
+, 0x2e1b21385c26c926ULL
+, 0x4d2c6dfc5ac42aedULL
+, 0x53380d139d95b3dfULL
+, 0x650a73548baf63deULL
+, 0x766a0abb3c77b2a8ULL
+, 0x81c2c92e47edaee6ULL
+, 0x92722c851482353bULL
+, 0xa2bfe8a14cf10364ULL
+, 0xa81a664bbc423001ULL
+, 0xc24b8b70d0f89791ULL
+, 0xc76c51a30654be30ULL
+, 0xd192e819d6ef5218ULL
+, 0xd69906245565a910ULL
+, 0xf40e35855771202aULL
+, 0x106aa07032bbd1b8ULL
+, 0x19a4c116b8d2d0c8ULL
+, 0x1e376c085141ab53ULL
+, 0x2748774cdf8eeb99ULL
+, 0x34b0bcb5e19b48a8ULL
+, 0x391c0cb3c5c95a63ULL
+, 0x4ed8aa4ae3418acbULL
+, 0x5b9cca4f7763e373ULL
+, 0x682e6ff3d6b2b8a3ULL
+, 0x748f82ee5defb2fcULL
+, 0x78a5636f43172f60ULL
+, 0x84c87814a1f0ab72ULL
+, 0x8cc702081a6439ecULL
+, 0x90befffa23631e28ULL
+, 0xa4506cebde82bde9ULL
+, 0xbef9a3f7b2c67915ULL
+, 0xc67178f2e372532bULL
+, 0xca273eceea26619cULL
+, 0xd186b8c721c0c207ULL
+, 0xeada7dd6cde0eb1eULL
+, 0xf57d4f7fee6ed178ULL
+, 0x06f067aa72176fbaULL
+, 0x0a637dc5a2c898a6ULL
+, 0x113f9804bef90daeULL
+, 0x1b710b35131c471bULL
+, 0x28db77f523047d84ULL
+, 0x32caab7b40c72493ULL
+, 0x3c9ebe0a15c9bebcULL
+, 0x431d67c49c100d4cULL
+, 0x4cc5d4becb3e42b6ULL
+, 0x597f299cfc657e2aULL
+, 0x5fcb6fab3ad6faecULL
+, 0x6c44198c4a475817ULL
+
+/* constants for the AVX code: */
+, 0x0001020304050607ULL
+, 0x08090a0b0c0d0e0fULL
+, 0x0001020304050607ULL
+, 0x08090a0b0c0d0e0fULL
+};
+
+#define CUTOFF 65536 /* must be multiple of 128 */
+
+int crypto_hashblocks(unsigned char *statebytes,const unsigned char *in,long long inlen)
+{
+  while (inlen >= CUTOFF) {
+    inner(statebytes,in,CUTOFF,constants); /* returns 0 */
+    in += CUTOFF;
+    inlen -= CUTOFF;
+  }
+  if (inlen < 128) return inlen;
+  return inner(statebytes,in,inlen,constants);
+}
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+
+import sys
+
+for i in (0,4,8,12):
+  if len(sys.argv) > 1:
+    if sys.argv[1] != str(i):
+      continue
+
+  i0 = (i+0)&15
+  i1 = (i+1)&15
+  i9 = (i+9)&15
+  i14 = (i+14)&15
+
+  print('            X%d = mem256[&w + %d]' % (i1,8*i1))
+  print('            W%d = mem128[&w + %d],0' % (i14,8*i14))
+  print('')
+  print('            4x X%dright1 = X%d unsigned>> 1' % (i1,i1))
+  print('            4x X%dleft63 = X%d << 63' % (i1,i1))
+  print('            X%dsigma0 = X%dright1 ^ X%dleft63' % (i1,i1,i1))
+  print('            4x X%dright8 = X%d unsigned>> 8' % (i1,i1))
+  print('            X%dsigma0 = X%dsigma0 ^ X%dright8' % (i1,i1,i1))
+  print('                2x,0 W%dright19 = W%d unsigned>> 19' % (i14,i14))
+  print('            4x X%dleft56 = X%d << 56' % (i1,i1))
+  print('                2x,0 W%dleft45 = W%d << 45' % (i14,i14))
+  print('            X%dsigma0 = X%dsigma0 ^ X%dleft56' % (i1,i1,i1))
+  print('                1x,0 W%dsigma1 = W%dright19 ^ W%dleft45' % (i14,i14,i14))
+  print('            4x X%dright7 = X%d unsigned>> 7' % (i1,i1))
+  print('                2x,0 W%dright61 = W%d unsigned>> 61' % (i14,i14))
+  print('            X%dsigma0 = X%dsigma0 ^ X%dright7' % (i1,i1,i1))
+  print('                1x,0 W%dsigma1 ^= W%dright61' % (i14,i14))
+  print('            4x X%d = X%d + X%dsigma0' % (i0,i0,i1))
+  print('                2x,0 W%dleft3 = W%d << 3' % (i14,i14))
+  print('            4x X%d = X%d + mem256[&w + %d]' % (i0,i0,8*i9))
+  print('                1x,0 W%dsigma1 ^= W%dleft3' % (i14,i14))
+  print('                2x,0 W%dright6 = W%d unsigned>> 6' % (i14,i14))
+  print('                1x,0 W%dsigma1 ^= W%dright6' % (i14,i14))
+  print('            4x X%d = W%dsigma1 + X%d' % (i0,i14,i0))
+  print('')
+  print('            2x,0 W%dright19 = X%d unsigned>> 19' % (i0,i0))
+  print('            2x,0 W%dleft45 = X%d << 45' % (i0,i0))
+  print('            1x,0 W%dsigma1 = W%dright19 ^ W%dleft45' % (i0,i0,i0))
+  print('            2x,0 W%dright61 = X%d unsigned>> 61' % (i0,i0))
+  print('            1x,0 W%dsigma1 ^= W%dright61' % (i0,i0))
+  print('            2x,0 W%dleft3 = X%d << 3' % (i0,i0))
+  print('            1x,0 W%dsigma1 ^= W%dleft3' % (i0,i0))
+  print('            2x,0 W%dright6 = X%d unsigned>> 6' % (i0,i0))
+  print('            1x,0 W%dsigma1 ^= W%dright6' % (i0,i0))
+  print('            W%dsigma1 = W%dsigma1[1],W%dsigma1[0]' % (i0,i0,i0))
+  print('')
+  print('            4x X%d = X%d + W%dsigma1' % (i0,i0,i0))
+  if i == 0:
+    print('            mem256[&w + 128] = X%d' % (i0))
+  print('            mem256[&w + %d] = X%d' % (8*i0,i0))
+  print('            4x D%d = X%d + mem256[constants + %d]' % (i0,i0,8*i0))
+  print('            wc%d%d%d%d = D%d' % (i,i+1,i+2,i+3,i0))
+  print('')
@@ -0,0 +1,42 @@
+            X1 = mem256[&w + 8]
+            W14 = mem128[&w + 112],0
+
+            4x X1right1 = X1 unsigned>> 1
+            4x X1left63 = X1 << 63
+            X1sigma0 = X1right1 ^ X1left63
+            4x X1right8 = X1 unsigned>> 8
+            X1sigma0 = X1sigma0 ^ X1right8
+                2x,0 W14right19 = W14 unsigned>> 19
+            4x X1left56 = X1 << 56
+                2x,0 W14left45 = W14 << 45
+            X1sigma0 = X1sigma0 ^ X1left56
+                1x,0 W14sigma1 = W14right19 ^ W14left45
+            4x X1right7 = X1 unsigned>> 7
+                2x,0 W14right61 = W14 unsigned>> 61
+            X1sigma0 = X1sigma0 ^ X1right7
+                1x,0 W14sigma1 ^= W14right61
+            4x X0 = X0 + X1sigma0
+                2x,0 W14left3 = W14 << 3
+            4x X0 = X0 + mem256[&w + 72]
+                1x,0 W14sigma1 ^= W14left3
+                2x,0 W14right6 = W14 unsigned>> 6
+                1x,0 W14sigma1 ^= W14right6
+            4x X0 = W14sigma1 + X0
+
+            2x,0 W0right19 = X0 unsigned>> 19
+            2x,0 W0left45 = X0 << 45
+            1x,0 W0sigma1 = W0right19 ^ W0left45
+            2x,0 W0right61 = X0 unsigned>> 61
+            1x,0 W0sigma1 ^= W0right61
+            2x,0 W0left3 = X0 << 3
+            1x,0 W0sigma1 ^= W0left3
+            2x,0 W0right6 = X0 unsigned>> 6
+            1x,0 W0sigma1 ^= W0right6
+            W0sigma1 = W0sigma1[1],W0sigma1[0]
+
+            4x X0 = X0 + W0sigma1
+            mem256[&w + 128] = X0
+            mem256[&w + 0] = X0
+            4x D0 = X0 + mem256[constants + 0]
+            wc0123 = D0
+
@@ -0,0 +1,41 @@
+            X13 = mem256[&w + 104]
+            W10 = mem128[&w + 80],0
+
+            4x X13right1 = X13 unsigned>> 1
+            4x X13left63 = X13 << 63
+            X13sigma0 = X13right1 ^ X13left63
+            4x X13right8 = X13 unsigned>> 8
+            X13sigma0 = X13sigma0 ^ X13right8
+                2x,0 W10right19 = W10 unsigned>> 19
+            4x X13left56 = X13 << 56
+                2x,0 W10left45 = W10 << 45
+            X13sigma0 = X13sigma0 ^ X13left56
+                1x,0 W10sigma1 = W10right19 ^ W10left45
+            4x X13right7 = X13 unsigned>> 7
+                2x,0 W10right61 = W10 unsigned>> 61
+            X13sigma0 = X13sigma0 ^ X13right7
+                1x,0 W10sigma1 ^= W10right61
+            4x X12 = X12 + X13sigma0
+                2x,0 W10left3 = W10 << 3
+            4x X12 = X12 + mem256[&w + 40]
+                1x,0 W10sigma1 ^= W10left3
+                2x,0 W10right6 = W10 unsigned>> 6
+                1x,0 W10sigma1 ^= W10right6
+            4x X12 = W10sigma1 + X12
+
+            2x,0 W12right19 = X12 unsigned>> 19
+            2x,0 W12left45 = X12 << 45
+            1x,0 W12sigma1 = W12right19 ^ W12left45
+            2x,0 W12right61 = X12 unsigned>> 61
+            1x,0 W12sigma1 ^= W12right61
+            2x,0 W12left3 = X12 << 3
+            1x,0 W12sigma1 ^= W12left3
+            2x,0 W12right6 = X12 unsigned>> 6
+            1x,0 W12sigma1 ^= W12right6
+            W12sigma1 = W12sigma1[1],W12sigma1[0]
+
+            4x X12 = X12 + W12sigma1
+            mem256[&w + 96] = X12
+            4x D12 = X12 + mem256[constants + 96]
+            wc12131415 = D12
+
@@ -0,0 +1,41 @@
+            X5 = mem256[&w + 40]
+            W2 = mem128[&w + 16],0
+
+            4x X5right1 = X5 unsigned>> 1
+            4x X5left63 = X5 << 63
+            X5sigma0 = X5right1 ^ X5left63
+            4x X5right8 = X5 unsigned>> 8
+            X5sigma0 = X5sigma0 ^ X5right8
+                2x,0 W2right19 = W2 unsigned>> 19
+            4x X5left56 = X5 << 56
+                2x,0 W2left45 = W2 << 45
+            X5sigma0 = X5sigma0 ^ X5left56
+                1x,0 W2sigma1 = W2right19 ^ W2left45
+            4x X5right7 = X5 unsigned>> 7
+                2x,0 W2right61 = W2 unsigned>> 61
+            X5sigma0 = X5sigma0 ^ X5right7
+                1x,0 W2sigma1 ^= W2right61
+            4x X4 = X4 + X5sigma0
+                2x,0 W2left3 = W2 << 3
+            4x X4 = X4 + mem256[&w + 104]
+                1x,0 W2sigma1 ^= W2left3
+                2x,0 W2right6 = W2 unsigned>> 6
+                1x,0 W2sigma1 ^= W2right6
+            4x X4 = W2sigma1 + X4
+
+            2x,0 W4right19 = X4 unsigned>> 19
+            2x,0 W4left45 = X4 << 45
+            1x,0 W4sigma1 = W4right19 ^ W4left45
+            2x,0 W4right61 = X4 unsigned>> 61
+            1x,0 W4sigma1 ^= W4right61
+            2x,0 W4left3 = X4 << 3
+            1x,0 W4sigma1 ^= W4left3
+            2x,0 W4right6 = X4 unsigned>> 6
+            1x,0 W4sigma1 ^= W4right6
+            W4sigma1 = W4sigma1[1],W4sigma1[0]
+
+            4x X4 = X4 + W4sigma1
+            mem256[&w + 32] = X4
+            4x D4 = X4 + mem256[constants + 32]
+            wc4567 = D4
+
@@ -0,0 +1,41 @@
+            X9 = mem256[&w + 72]
+            W6 = mem128[&w + 48],0
+
+            4x X9right1 = X9 unsigned>> 1
+            4x X9left63 = X9 << 63
+            X9sigma0 = X9right1 ^ X9left63
+            4x X9right8 = X9 unsigned>> 8
+            X9sigma0 = X9sigma0 ^ X9right8
+                2x,0 W6right19 = W6 unsigned>> 19
+            4x X9left56 = X9 << 56
+                2x,0 W6left45 = W6 << 45
+            X9sigma0 = X9sigma0 ^ X9left56
+                1x,0 W6sigma1 = W6right19 ^ W6left45
+            4x X9right7 = X9 unsigned>> 7
+                2x,0 W6right61 = W6 unsigned>> 61
+            X9sigma0 = X9sigma0 ^ X9right7
+                1x,0 W6sigma1 ^= W6right61
+            4x X8 = X8 + X9sigma0
+                2x,0 W6left3 = W6 << 3
+            4x X8 = X8 + mem256[&w + 8]
+                1x,0 W6sigma1 ^= W6left3
+                2x,0 W6right6 = W6 unsigned>> 6
+                1x,0 W6sigma1 ^= W6right6
+            4x X8 = W6sigma1 + X8
+
+            2x,0 W8right19 = X8 unsigned>> 19
+            2x,0 W8left45 = X8 << 45
+            1x,0 W8sigma1 = W8right19 ^ W8left45
+            2x,0 W8right61 = X8 unsigned>> 61
+            1x,0 W8sigma1 ^= W8right61
+            2x,0 W8left3 = X8 << 3
+            1x,0 W8sigma1 ^= W8left3
+            2x,0 W8right6 = X8 unsigned>> 6
+            1x,0 W8sigma1 ^= W8right6
+            W8sigma1 = W8sigma1[1],W8sigma1[0]
+
+            4x X8 = X8 + W8sigma1
+            mem256[&w + 64] = X8
+            4x D8 = X8 + mem256[constants + 64]
+            wc891011 = D8
+
@@ -0,0 +1 @@
+../m3/implementors
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+cpp \
+| qhasm-amd64avx \
+| sed 's/\<inner\>/CRYPTO_SHARED_NAMESPACE(inner)/' \
+| sed 's/\<_inner\>/_CRYPTO_SHARED_NAMESPACE(inner)/' \
+| sed 's/^\.p2align 5/.p2align 7/' \
+| awk '{
+  found = 0
+  if (!found && $0 == "and $31,%r11") {
+    found = 1
+    $0 = "and $511,%r11"
+  }
+  print
+}'
@@ -0,0 +1,10 @@
+#ifndef inner_h
+#define inner_h
+
+#define inner CRYPTO_SHARED_NAMESPACE(inner)
+
+#include "crypto_uint64.h"
+
+extern int inner(unsigned char *,const unsigned char *,unsigned long long,const crypto_uint64 *);
+
+#endif
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+
+import sys
+
+j = int(sys.argv[1])
+
+j0 = (j+0)&15
+j1 = (j+1)&15
+j2 = (j+2)&15
+j3 = (j+3)&15
+j9 = (j+9)&15
+j14 = (j+14)&15
+
+k0 = (j+8)&15
+k1 = (j+9)&15
+k2 = (j+10)&15
+k3 = (j+11)&15
+
+wcin = 'wc%d%d%d%d' % (k0,k1,k2,k3)
+wcout = 'wc%d%d%d%d' % (j0,j1,j2,j3)
+
+i0 = (j+0)&7
+i1 = (j+1)&7
+i2 = (j+2)&7
+i3 = (j+3)&7
+i4 = (j+4)&7
+i5 = (j+5)&7
+i6 = (j+6)&7
+i7 = (j+7)&7
+
+print('            X%d = mem256[&w + %d]' % (j1,8*j1))
+print('            4x X%dright1 = X%d unsigned>> 1' % (j1,j1))
+print('      r%dSigma1 = r%d>>>14' % (i4,i4))
+print('    r%d += %s[0]' % (i7,wcin))
+print('    ch%d = r%d' % (i7,i6))
+print('    ch%d ^= r%d' % (i7,i5))
+print('')
+print('            4x X%dleft63 = X%d << 63' % (j1,j1))
+print('      r%d18 = r%d>>>18' % (i4,i4))
+print('    ch%d &= r%d' % (i7,i4))
+print('  maj%d = r%d' % (i6,i1))
+print('  maj%d ^= r%d' % (i6,i0))
+print('')
+print('            X%dsigma0 = X%dright1 ^ X%dleft63' % (j1,j1,j1))
+print('      r%d41 = r%d>>>41' % (i4,i4))
+print('      r%dSigma1 ^= r%d18' % (i4,i4))
+print('    ch%d ^= r%d' % (i7,i6))
+print('')
+print('            4x X%dright8 = X%d unsigned>> 8' % (j1,j1))
+print('      r%dSigma1 ^= r%d41' % (i4,i4))
+print('      r%dSigma0 = r%d>>>28' % (i0,i0))
+print('    r%d += ch%d' % (i7,i7))
+print('')
+print('            X%dsigma0 = X%dsigma0 ^ X%dright8' % (j1,j1,j1))
+print('      r%d34 = r%d>>>34' % (i0,i0))
+print('      r%d += r%dSigma1' % (i7,i4))
+print('  maj%d = r%d' % (i7,i2))
+print('  maj%d &= maj%d' % (i7,i6))
+print('')
+print('            W%d = mem128[&w + %d],0' % (j14,8*j14))
+print('            2x,0 W%dright19 = W%d unsigned>> 19' % (j14,j14))
+print('      r%dSigma0 ^= r%d34' % (i0,i0))
+print('      r%d39 = r%d>>>39' % (i0,i0))
+print('  r%d += r%d' % (i3,i7))
+print('')
+print('            4x X%dleft56 = X%d << 56' % (j1,j1))
+print('      r%dSigma0 ^= r%d39' % (i0,i0))
+print('    r%d += %s[1]' % (i6,wcin))
+print('  r%dandr%d = r%d' % (i0,i1,i1))
+print('  r%dandr%d &= r%d' % (i0,i1,i0))
+print('')
+print('            2x,0 W%dleft45 = W%d << 45' % (j14,j14))
+print('      r%d += r%dSigma0' % (i7,i0))
+print('  maj%d ^= r%dandr%d' % (i7,i0,i1))
+print('    ch%d = r%d' % (i6,i5))
+print('    ch%d ^= r%d' % (i6,i4))
+print('')
+print('            X%dsigma0 = X%dsigma0 ^ X%dleft56' % (j1,j1,j1))
+print('            2x,0 W%dright61 = W%d unsigned>> 61' % (j14,j14))
+print('      r%dSigma1 = r%d>>>14' % (i3,i3))
+print('  r%d += maj%d' % (i7,i7))
+print('')
+print('            4x X%dright7 = X%d unsigned>> 7' % (j1,j1))
+print('            1x,0 W%dsigma1 = W%dright19 ^ W%dleft45' % (j14,j14,j14))
+print('      r%d18 = r%d>>>18' % (i3,i3))
+print('    ch%d &= r%d' % (i6,i3))
+print('')
+print('            X%dsigma0 = X%dsigma0 ^ X%dright7' % (j1,j1,j1))
+print('      r%dSigma1 ^= r%d18' % (i3,i3))
+print('      r%d41 = r%d>>>41' % (i3,i3))
+print('  maj%d &= r%d' % (i6,i7))
+print('')
+print('            1x,0 W%dsigma1 ^= W%dright61' % (j14,j14))
+print('            4x X%d = X%d + mem256[&w + %d]' % (j0,j0,8*j9))
+print('      r%dSigma1 ^= r%d41' % (i3,i3))
+print('  maj%d ^= r%dandr%d' % (i6,i0,i1))
+print('')
+print('            2x,0 W%dleft3 = W%d << 3' % (j14,j14))
+print('      r%dSigma0 = r%d>>>28' % (i7,i7))
+print('    ch%d ^= r%d' % (i6,i5))
+print('      r%d += r%dSigma1' % (i6,i3))
+print('')
+print('            4x X%d = X%d + X%dsigma0' % (j0,j0,j1))
+print('      r%d34 = r%d>>>34' % (i7,i7))
+print('    r%d += %s[2]' % (i5,wcin))
+print('    r%d += ch%d' % (i6,i6))
+print('')
+print('            1x,0 W%dsigma1 ^= W%dleft3' % (j14,j14))
+print('      r%dSigma0 ^= r%d34' % (i7,i7))
+print('      r%d39 = r%d>>>39' % (i7,i7))
+print('  r%d += r%d' % (i2,i6))
+print('')
+print('            2x,0 W%dright6 = W%d unsigned>> 6' % (j14,j14))
+print('      r%dSigma0 ^= r%d39' % (i7,i7))
+print('  r%d += maj%d' % (i6,i6))
+print('    ch%d = r%d' % (i5,i4))
+print('    ch%d ^= r%d' % (i5,i3))
+print('')
+print('            1x,0 W%dsigma1 ^= W%dright6' % (j14,j14))
+print('      r%d += r%dSigma0' % (i6,i7))
+print('      r%dSigma1 = r%d>>>14' % (i2,i2))
+print('    ch%d &= r%d' % (i5,i2))
+print('')
+print('            4x X%d = W%dsigma1 + X%d' % (j0,j14,j0))
+print('      r%d18 = r%d>>>18' % (i2,i2))
+print('      r%d41 = r%d>>>41' % (i2,i2))
+print('    ch%d ^= r%d' % (i5,i4))
+print('')
+print('            2x,0 W%dright19 = X%d unsigned>> 19' % (j0,j0))
+print('      r%dSigma1 ^= r%d18' % (i2,i2))
+print('      r%dSigma0 = r%d>>>28' % (i6,i6))
+print('    r%d += ch%d' % (i5,i5))
+print('')
+print('            2x,0 W%dleft45 = X%d << 45' % (j0,j0))
+print('      r%dSigma1 ^= r%d41' % (i2,i2))
+print('      r%d34 = r%d>>>34' % (i6,i6))
+print('  maj%d = r%d' % (i4,i7))
+print('  maj%d ^= r%d' % (i4,i6))
+print('')
+print('            2x,0 W%dright61 = X%d unsigned>> 61' % (j0,j0))
+print('            1x,0 W%dsigma1 = W%dright19 ^ W%dleft45' % (j0,j0,j0))
+print('      r%dSigma0 ^= r%d34' % (i6,i6))
+print('      r%d39 = r%d>>>39' % (i6,i6))
+print('')
+print('            2x,0 W%dleft3 = X%d << 3' % (j0,j0))
+print('            1x,0 W%dsigma1 ^= W%dright61' % (j0,j0))
+print('      r%dSigma0 ^= r%d39' % (i6,i6))
+print('      r%d += r%dSigma1' % (i5,i2))
+print('')
+print('            2x,0 W%dright6 = X%d unsigned>> 6' % (j0,j0))
+print('            1x,0 W%dsigma1 ^= W%dleft3' % (j0,j0))
+print('  r%d += r%d' % (i1,i5))
+print('  r%dandr%d = r%d' % (i6,i7,i7))
+print('  r%dandr%d &= r%d' % (i6,i7,i6))
+print('')
+print('            1x,0 W%dsigma1 ^= W%dright6' % (j0,j0))
+print('      r%dSigma1 = r%d>>>14' % (i1,i1))
+print('      r%d += r%dSigma0' % (i5,i6))
+print('  maj%d = r%d' % (i5,i0))
+print('  maj%d &= maj%d' % (i5,i4))
+print('')
+print('            W%dsigma1 = W%dsigma1[1],W%dsigma1[0]' % (j0,j0,j0))
+print('  maj%d ^= r%dandr%d' % (i5,i6,i7))
+print('    ch%d = r%d' % (i4,i3))
+print('    ch%d ^= r%d' % (i4,i2))
+print('')
+print('  r%d += maj%d' % (i5,i5))
+print('    ch%d &= r%d' % (i4,i1))
+print('      r%d18 = r%d>>>18' % (i1,i1))
+print('')
+print('  maj%d &= r%d' % (i4,i5))
+print('    ch%d ^= r%d' % (i4,i3))
+print('    r%d += %s[3]' % (i4,wcin))
+print('      r%dSigma1 ^= r%d18' % (i1,i1))
+print('')
+print('      r%d41 = r%d>>>41' % (i1,i1))
+print('            4x X%d = X%d + W%dsigma1' % (j0,j0,j0))
+if j0 == 0:
+  print('            mem256[&w + 128] = X%d' % (j0))
+print('            mem256[&w + %d] = X%d' % (8*j0,j0))
+print('    r%d += ch%d' % (i4,i4))
+print('  maj%d ^= r%dandr%d' % (i4,i6,i7))
+print('')
+print('      r%dSigma0 = r%d>>>28' % (i5,i5))
+print('            4x D%d = X%d + mem256[constants + %d]' % (j0,j0,8*j0))
+print('            %s = D%d' % (wcout,j0))
+print('      r%d34 = r%d>>>34' % (i5,i5))
+print('      r%dSigma1 ^= r%d41' % (i1,i1))
+print('')
+print('      r%d += r%dSigma1' % (i4,i1))
+print('      r%dSigma0 ^= r%d34' % (i5,i5))
+print('      r%d39 = r%d>>>39' % (i5,i5))
+print('')
+print('  r%d += r%d' % (i0,i4))
+print('  r%d += maj%d' % (i4,i4))
+print('      r%dSigma0 ^= r%d39' % (i5,i5))
+print('')
+print('      r%d += r%dSigma0' % (i4,i5))
@@ -0,0 +1,167 @@
+            X1 = mem256[&w + 8]
+            4x X1right1 = X1 unsigned>> 1
+      r4Sigma1 = r4>>>14
+    r7 += wc891011[0]
+    ch7 = r6
+    ch7 ^= r5
+
+            4x X1left63 = X1 << 63
+      r418 = r4>>>18
+    ch7 &= r4
+  maj6 = r1
+  maj6 ^= r0
+
+            X1sigma0 = X1right1 ^ X1left63
+      r441 = r4>>>41
+      r4Sigma1 ^= r418
+    ch7 ^= r6
+
+            4x X1right8 = X1 unsigned>> 8
+      r4Sigma1 ^= r441
+      r0Sigma0 = r0>>>28
+    r7 += ch7
+
+            X1sigma0 = X1sigma0 ^ X1right8
+      r034 = r0>>>34
+      r7 += r4Sigma1
+  maj7 = r2
+  maj7 &= maj6
+
+            W14 = mem128[&w + 112],0
+            2x,0 W14right19 = W14 unsigned>> 19
+      r0Sigma0 ^= r034
+      r039 = r0>>>39
+  r3 += r7
+
+            4x X1left56 = X1 << 56
+      r0Sigma0 ^= r039
+    r6 += wc891011[1]
+  r0andr1 = r1
+  r0andr1 &= r0
+
+            2x,0 W14left45 = W14 << 45
+      r7 += r0Sigma0
+  maj7 ^= r0andr1
+    ch6 = r5
+    ch6 ^= r4
+
+            X1sigma0 = X1sigma0 ^ X1left56
+            2x,0 W14right61 = W14 unsigned>> 61
+      r3Sigma1 = r3>>>14
+  r7 += maj7
+
+            4x X1right7 = X1 unsigned>> 7
+            1x,0 W14sigma1 = W14right19 ^ W14left45
+      r318 = r3>>>18
+    ch6 &= r3
+
+            X1sigma0 = X1sigma0 ^ X1right7
+      r3Sigma1 ^= r318
+      r341 = r3>>>41
+  maj6 &= r7
+
+            1x,0 W14sigma1 ^= W14right61
+            4x X0 = X0 + mem256[&w + 72]
+      r3Sigma1 ^= r341
+  maj6 ^= r0andr1
+
+            2x,0 W14left3 = W14 << 3
+      r7Sigma0 = r7>>>28
+    ch6 ^= r5
+      r6 += r3Sigma1
+
+            4x X0 = X0 + X1sigma0
+      r734 = r7>>>34
+    r5 += wc891011[2]
+    r6 += ch6
+
+            1x,0 W14sigma1 ^= W14left3
+      r7Sigma0 ^= r734
+      r739 = r7>>>39
+  r2 += r6
+
+            2x,0 W14right6 = W14 unsigned>> 6
+      r7Sigma0 ^= r739
+  r6 += maj6
+    ch5 = r4
+    ch5 ^= r3
+
+            1x,0 W14sigma1 ^= W14right6
+      r6 += r7Sigma0
+      r2Sigma1 = r2>>>14
+    ch5 &= r2
+
+            4x X0 = W14sigma1 + X0
+      r218 = r2>>>18
+      r241 = r2>>>41
+    ch5 ^= r4
+
+            2x,0 W0right19 = X0 unsigned>> 19
+      r2Sigma1 ^= r218
+      r6Sigma0 = r6>>>28
+    r5 += ch5
+
+            2x,0 W0left45 = X0 << 45
+      r2Sigma1 ^= r241
+      r634 = r6>>>34
+  maj4 = r7
+  maj4 ^= r6
+
+            2x,0 W0right61 = X0 unsigned>> 61
+            1x,0 W0sigma1 = W0right19 ^ W0left45
+      r6Sigma0 ^= r634
+      r639 = r6>>>39
+
+            2x,0 W0left3 = X0 << 3
+            1x,0 W0sigma1 ^= W0right61
+      r6Sigma0 ^= r639
+      r5 += r2Sigma1
+
+            2x,0 W0right6 = X0 unsigned>> 6
+            1x,0 W0sigma1 ^= W0left3
+  r1 += r5
+  r6andr7 = r7
+  r6andr7 &= r6
+
+            1x,0 W0sigma1 ^= W0right6
+      r1Sigma1 = r1>>>14
+      r5 += r6Sigma0
+  maj5 = r0
+  maj5 &= maj4
+
+            W0sigma1 = W0sigma1[1],W0sigma1[0]
+  maj5 ^= r6andr7
+    ch4 = r3
+    ch4 ^= r2
+
+  r5 += maj5
+    ch4 &= r1
+      r118 = r1>>>18
+
+  maj4 &= r5
+    ch4 ^= r3
+    r4 += wc891011[3]
+      r1Sigma1 ^= r118
+
+      r141 = r1>>>41
+            4x X0 = X0 + W0sigma1
+            mem256[&w + 128] = X0
+            mem256[&w + 0] = X0
+    r4 += ch4
+  maj4 ^= r6andr7
+
+      r5Sigma0 = r5>>>28
+            4x D0 = X0 + mem256[constants + 0]
+            wc0123 = D0
+      r534 = r5>>>34
+      r1Sigma1 ^= r141
+
+      r4 += r1Sigma1
+      r5Sigma0 ^= r534
+      r539 = r5>>>39
+
+  r0 += r4
+  r4 += maj4
+      r5Sigma0 ^= r539
+
+      r4 += r5Sigma0
@@ -0,0 +1,166 @@
+            X13 = mem256[&w + 104]
+            4x X13right1 = X13 unsigned>> 1
+      r0Sigma1 = r0>>>14
+    r3 += wc4567[0]
+    ch3 = r2
+    ch3 ^= r1
+
+            4x X13left63 = X13 << 63
+      r018 = r0>>>18
+    ch3 &= r0
+  maj2 = r5
+  maj2 ^= r4
+
+            X13sigma0 = X13right1 ^ X13left63
+      r041 = r0>>>41
+      r0Sigma1 ^= r018
+    ch3 ^= r2
+
+            4x X13right8 = X13 unsigned>> 8
+      r0Sigma1 ^= r041
+      r4Sigma0 = r4>>>28
+    r3 += ch3
+
+            X13sigma0 = X13sigma0 ^ X13right8
+      r434 = r4>>>34
+      r3 += r0Sigma1
+  maj3 = r6
+  maj3 &= maj2
+
+            W10 = mem128[&w + 80],0
+            2x,0 W10right19 = W10 unsigned>> 19
+      r4Sigma0 ^= r434
+      r439 = r4>>>39
+  r7 += r3
+
+            4x X13left56 = X13 << 56
+      r4Sigma0 ^= r439
+    r2 += wc4567[1]
+  r4andr5 = r5
+  r4andr5 &= r4
+
+            2x,0 W10left45 = W10 << 45
+      r3 += r4Sigma0
+  maj3 ^= r4andr5
+    ch2 = r1
+    ch2 ^= r0
+
+            X13sigma0 = X13sigma0 ^ X13left56
+            2x,0 W10right61 = W10 unsigned>> 61
+      r7Sigma1 = r7>>>14
+  r3 += maj3
+
+            4x X13right7 = X13 unsigned>> 7
+            1x,0 W10sigma1 = W10right19 ^ W10left45
+      r718 = r7>>>18
+    ch2 &= r7
+
+            X13sigma0 = X13sigma0 ^ X13right7
+      r7Sigma1 ^= r718
+      r741 = r7>>>41
+  maj2 &= r3
+
+            1x,0 W10sigma1 ^= W10right61
+            4x X12 = X12 + mem256[&w + 40]
+      r7Sigma1 ^= r741
+  maj2 ^= r4andr5
+
+            2x,0 W10left3 = W10 << 3
+      r3Sigma0 = r3>>>28
+    ch2 ^= r1
+      r2 += r7Sigma1
+
+            4x X12 = X12 + X13sigma0
+      r334 = r3>>>34
+    r1 += wc4567[2]
+    r2 += ch2
+
+            1x,0 W10sigma1 ^= W10left3
+      r3Sigma0 ^= r334
+      r339 = r3>>>39
+  r6 += r2
+
+            2x,0 W10right6 = W10 unsigned>> 6
+      r3Sigma0 ^= r339
+  r2 += maj2
+    ch1 = r0
+    ch1 ^= r7
+
+            1x,0 W10sigma1 ^= W10right6
+      r2 += r3Sigma0
+      r6Sigma1 = r6>>>14
+    ch1 &= r6
+
+            4x X12 = W10sigma1 + X12
+      r618 = r6>>>18
+      r641 = r6>>>41
+    ch1 ^= r0
+
+            2x,0 W12right19 = X12 unsigned>> 19
+      r6Sigma1 ^= r618
+      r2Sigma0 = r2>>>28
+    r1 += ch1
+
+            2x,0 W12left45 = X12 << 45
+      r6Sigma1 ^= r641
+      r234 = r2>>>34
+  maj0 = r3
+  maj0 ^= r2
+
+            2x,0 W12right61 = X12 unsigned>> 61
+            1x,0 W12sigma1 = W12right19 ^ W12left45
+      r2Sigma0 ^= r234
+      r239 = r2>>>39
+
+            2x,0 W12left3 = X12 << 3
+            1x,0 W12sigma1 ^= W12right61
+      r2Sigma0 ^= r239
+      r1 += r6Sigma1
+
+            2x,0 W12right6 = X12 unsigned>> 6
+            1x,0 W12sigma1 ^= W12left3
+  r5 += r1
+  r2andr3 = r3
+  r2andr3 &= r2
+
+            1x,0 W12sigma1 ^= W12right6
+      r5Sigma1 = r5>>>14
+      r1 += r2Sigma0
+  maj1 = r4
+  maj1 &= maj0
+
+            W12sigma1 = W12sigma1[1],W12sigma1[0]
+  maj1 ^= r2andr3
+    ch0 = r7
+    ch0 ^= r6
+
+  r1 += maj1
+    ch0 &= r5
+      r518 = r5>>>18
+
+  maj0 &= r1
+    ch0 ^= r7
+    r0 += wc4567[3]
+      r5Sigma1 ^= r518
+
+      r541 = r5>>>41
+            4x X12 = X12 + W12sigma1
+            mem256[&w + 96] = X12
+    r0 += ch0
+  maj0 ^= r2andr3
+
+      r1Sigma0 = r1>>>28
+            4x D12 = X12 + mem256[constants + 96]
+            wc12131415 = D12
+      r134 = r1>>>34
+      r5Sigma1 ^= r541
+
+      r0 += r5Sigma1
+      r1Sigma0 ^= r134
+      r139 = r1>>>39
+
+  r4 += r0
+  r0 += maj0
+      r1Sigma0 ^= r139
+
+      r0 += r1Sigma0
@@ -0,0 +1,166 @@
+            X5 = mem256[&w + 40]
+            4x X5right1 = X5 unsigned>> 1
+      r0Sigma1 = r0>>>14
+    r3 += wc12131415[0]
+    ch3 = r2
+    ch3 ^= r1
+
+            4x X5left63 = X5 << 63
+      r018 = r0>>>18
+    ch3 &= r0
+  maj2 = r5
+  maj2 ^= r4
+
+            X5sigma0 = X5right1 ^ X5left63
+      r041 = r0>>>41
+      r0Sigma1 ^= r018
+    ch3 ^= r2
+
+            4x X5right8 = X5 unsigned>> 8
+      r0Sigma1 ^= r041
+      r4Sigma0 = r4>>>28
+    r3 += ch3
+
+            X5sigma0 = X5sigma0 ^ X5right8
+      r434 = r4>>>34
+      r3 += r0Sigma1
+  maj3 = r6
+  maj3 &= maj2
+
+            W2 = mem128[&w + 16],0
+            2x,0 W2right19 = W2 unsigned>> 19
+      r4Sigma0 ^= r434
+      r439 = r4>>>39
+  r7 += r3
+
+            4x X5left56 = X5 << 56
+      r4Sigma0 ^= r439
+    r2 += wc12131415[1]
+  r4andr5 = r5
+  r4andr5 &= r4
+
+            2x,0 W2left45 = W2 << 45
+      r3 += r4Sigma0
+  maj3 ^= r4andr5
+    ch2 = r1
+    ch2 ^= r0
+
+            X5sigma0 = X5sigma0 ^ X5left56
+            2x,0 W2right61 = W2 unsigned>> 61
+      r7Sigma1 = r7>>>14
+  r3 += maj3
+
+            4x X5right7 = X5 unsigned>> 7
+            1x,0 W2sigma1 = W2right19 ^ W2left45
+      r718 = r7>>>18
+    ch2 &= r7
+
+            X5sigma0 = X5sigma0 ^ X5right7
+      r7Sigma1 ^= r718
+      r741 = r7>>>41
+  maj2 &= r3
+
+            1x,0 W2sigma1 ^= W2right61
+            4x X4 = X4 + mem256[&w + 104]
+      r7Sigma1 ^= r741
+  maj2 ^= r4andr5
+
+            2x,0 W2left3 = W2 << 3
+      r3Sigma0 = r3>>>28
+    ch2 ^= r1
+      r2 += r7Sigma1
+
+            4x X4 = X4 + X5sigma0
+      r334 = r3>>>34
+    r1 += wc12131415[2]
+    r2 += ch2
+
+            1x,0 W2sigma1 ^= W2left3
+      r3Sigma0 ^= r334
+      r339 = r3>>>39
+  r6 += r2
+
+            2x,0 W2right6 = W2 unsigned>> 6
+      r3Sigma0 ^= r339
+  r2 += maj2
+    ch1 = r0
+    ch1 ^= r7
+
+            1x,0 W2sigma1 ^= W2right6
+      r2 += r3Sigma0
+      r6Sigma1 = r6>>>14
+    ch1 &= r6
+
+            4x X4 = W2sigma1 + X4
+      r618 = r6>>>18
+      r641 = r6>>>41
+    ch1 ^= r0
+
+            2x,0 W4right19 = X4 unsigned>> 19
+      r6Sigma1 ^= r618
+      r2Sigma0 = r2>>>28
+    r1 += ch1
+
+            2x,0 W4left45 = X4 << 45
+      r6Sigma1 ^= r641
+      r234 = r2>>>34
+  maj0 = r3
+  maj0 ^= r2
+
+            2x,0 W4right61 = X4 unsigned>> 61
+            1x,0 W4sigma1 = W4right19 ^ W4left45
+      r2Sigma0 ^= r234
+      r239 = r2>>>39
+
+            2x,0 W4left3 = X4 << 3
+            1x,0 W4sigma1 ^= W4right61
+      r2Sigma0 ^= r239
+      r1 += r6Sigma1
+
+            2x,0 W4right6 = X4 unsigned>> 6
+            1x,0 W4sigma1 ^= W4left3
+  r5 += r1
+  r2andr3 = r3
+  r2andr3 &= r2
+
+            1x,0 W4sigma1 ^= W4right6
+      r5Sigma1 = r5>>>14
+      r1 += r2Sigma0
+  maj1 = r4
+  maj1 &= maj0
+
+            W4sigma1 = W4sigma1[1],W4sigma1[0]
+  maj1 ^= r2andr3
+    ch0 = r7
+    ch0 ^= r6
+
+  r1 += maj1
+    ch0 &= r5
+      r518 = r5>>>18
+
+  maj0 &= r1
+    ch0 ^= r7
+    r0 += wc12131415[3]
+      r5Sigma1 ^= r518
+
+      r541 = r5>>>41
+            4x X4 = X4 + W4sigma1
+            mem256[&w + 32] = X4
+    r0 += ch0
+  maj0 ^= r2andr3
+
+      r1Sigma0 = r1>>>28
+            4x D4 = X4 + mem256[constants + 32]
+            wc4567 = D4
+      r134 = r1>>>34
+      r5Sigma1 ^= r541
+
+      r0 += r5Sigma1
+      r1Sigma0 ^= r134
+      r139 = r1>>>39
+
+  r4 += r0
+  r0 += maj0
+      r1Sigma0 ^= r139
+
+      r0 += r1Sigma0
@@ -0,0 +1,166 @@
+            X9 = mem256[&w + 72]
+            4x X9right1 = X9 unsigned>> 1
+      r4Sigma1 = r4>>>14
+    r7 += wc0123[0]
+    ch7 = r6
+    ch7 ^= r5
+
+            4x X9left63 = X9 << 63
+      r418 = r4>>>18
+    ch7 &= r4
+  maj6 = r1
+  maj6 ^= r0
+
+            X9sigma0 = X9right1 ^ X9left63
+      r441 = r4>>>41
+      r4Sigma1 ^= r418
+    ch7 ^= r6
+
+            4x X9right8 = X9 unsigned>> 8
+      r4Sigma1 ^= r441
+      r0Sigma0 = r0>>>28
+    r7 += ch7
+
+            X9sigma0 = X9sigma0 ^ X9right8
+      r034 = r0>>>34
+      r7 += r4Sigma1
+  maj7 = r2
+  maj7 &= maj6
+
+            W6 = mem128[&w + 48],0
+            2x,0 W6right19 = W6 unsigned>> 19
+      r0Sigma0 ^= r034
+      r039 = r0>>>39
+  r3 += r7
+
+            4x X9left56 = X9 << 56
+      r0Sigma0 ^= r039
+    r6 += wc0123[1]
+  r0andr1 = r1
+  r0andr1 &= r0
+
+            2x,0 W6left45 = W6 << 45
+      r7 += r0Sigma0
+  maj7 ^= r0andr1
+    ch6 = r5
+    ch6 ^= r4
+
+            X9sigma0 = X9sigma0 ^ X9left56
+            2x,0 W6right61 = W6 unsigned>> 61
+      r3Sigma1 = r3>>>14
+  r7 += maj7
+
+            4x X9right7 = X9 unsigned>> 7
+            1x,0 W6sigma1 = W6right19 ^ W6left45
+      r318 = r3>>>18
+    ch6 &= r3
+
+            X9sigma0 = X9sigma0 ^ X9right7
+      r3Sigma1 ^= r318
+      r341 = r3>>>41
+  maj6 &= r7
+
+            1x,0 W6sigma1 ^= W6right61
+            4x X8 = X8 + mem256[&w + 8]
+      r3Sigma1 ^= r341
+  maj6 ^= r0andr1
+
+            2x,0 W6left3 = W6 << 3
+      r7Sigma0 = r7>>>28
+    ch6 ^= r5
+      r6 += r3Sigma1
+
+            4x X8 = X8 + X9sigma0
+      r734 = r7>>>34
+    r5 += wc0123[2]
+    r6 += ch6
+
+            1x,0 W6sigma1 ^= W6left3
+      r7Sigma0 ^= r734
+      r739 = r7>>>39
+  r2 += r6
+
+            2x,0 W6right6 = W6 unsigned>> 6
+      r7Sigma0 ^= r739
+  r6 += maj6
+    ch5 = r4
+    ch5 ^= r3
+
+            1x,0 W6sigma1 ^= W6right6
+      r6 += r7Sigma0
+      r2Sigma1 = r2>>>14
+    ch5 &= r2
+
+            4x X8 = W6sigma1 + X8
+      r218 = r2>>>18
+      r241 = r2>>>41
+    ch5 ^= r4
+
+            2x,0 W8right19 = X8 unsigned>> 19
+      r2Sigma1 ^= r218
+      r6Sigma0 = r6>>>28
+    r5 += ch5
+
+            2x,0 W8left45 = X8 << 45
+      r2Sigma1 ^= r241
+      r634 = r6>>>34
+  maj4 = r7
+  maj4 ^= r6
+
+            2x,0 W8right61 = X8 unsigned>> 61
+            1x,0 W8sigma1 = W8right19 ^ W8left45
+      r6Sigma0 ^= r634
+      r639 = r6>>>39
+
+            2x,0 W8left3 = X8 << 3
+            1x,0 W8sigma1 ^= W8right61
+      r6Sigma0 ^= r639
+      r5 += r2Sigma1
+
+            2x,0 W8right6 = X8 unsigned>> 6
+            1x,0 W8sigma1 ^= W8left3
+  r1 += r5
+  r6andr7 = r7
+  r6andr7 &= r6
+
+            1x,0 W8sigma1 ^= W8right6
+      r1Sigma1 = r1>>>14
+      r5 += r6Sigma0
+  maj5 = r0
+  maj5 &= maj4
+
+            W8sigma1 = W8sigma1[1],W8sigma1[0]
+  maj5 ^= r6andr7
+    ch4 = r3
+    ch4 ^= r2
+
+  r5 += maj5
+    ch4 &= r1
+      r118 = r1>>>18
+
+  maj4 &= r5
+    ch4 ^= r3
+    r4 += wc0123[3]
+      r1Sigma1 ^= r118
+
+      r141 = r1>>>41
+            4x X8 = X8 + W8sigma1
+            mem256[&w + 64] = X8
+    r4 += ch4
+  maj4 ^= r6andr7
+
+      r5Sigma0 = r5>>>28
+            4x D8 = X8 + mem256[constants + 64]
+            wc891011 = D8
+      r534 = r5>>>34
+      r1Sigma1 ^= r141
+
+      r4 += r1Sigma1
+      r5Sigma0 ^= r534
+      r539 = r5>>>39
+
+  r0 += r4
+  r4 += maj4
+      r5Sigma0 ^= r539
+
+      r4 += r5Sigma0
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+
+import sys
+
+i = 0
+
+for doubleround in range(8):
+  i0 = (i+0)&7
+  i1 = (i+1)&7
+  i2 = (i+2)&7
+  i3 = (i+3)&7
+  i4 = (i+4)&7
+  i5 = (i+5)&7
+  i6 = (i+6)&7
+  i7 = (i+7)&7
+
+  round = 2*doubleround
+  round4 = round&~3
+  loadarray = 'wc%d%d%d%d' % (round4,round4+1,round4+2,round4+3)
+  i0load = '%s[%d]' % (loadarray,(round+0)&3)
+  i1load = '%s[%d]' % (loadarray,(round+1)&3)
+
+  i -= 2
+
+  if len(sys.argv) > 1:
+    if sys.argv[1] != str(doubleround):
+      continue
+
+
+  print('    r%d += %s' % (i7,i0load))
+  print('      r%dSigma1 = r%d>>>14' % (i4,i4))
+  print('    ch%d = r%d' % (i7,i6))
+  print('      r%d18 = r%d>>>18' % (i4,i4))
+  print('    ch%d ^= r%d' % (i7,i5))
+
+  print('      r%d41 = r%d>>>41' % (i4,i4))
+  print('      r%dSigma1 ^= r%d18' % (i4,i4))
+  print('    ch%d &= r%d' % (i7,i4))
+  print('      r%dSigma0 = r%d>>>28' % (i0,i0))
+
+  print('      r%dSigma1 ^= r%d41' % (i4,i4))
+  print('    ch%d ^= r%d' % (i7,i6))
+  print('      r%d34 = r%d>>>34' % (i0,i0))
+  print('  maj%d = r%d' % (i6,i1))
+  print('  maj%d ^= r%d' % (i6,i0))
+
+  print('      r%dSigma0 ^= r%d34' % (i0,i0))
+  print('    r%d += ch%d' % (i7,i7))
+  print('  r%dandr%d = r%d' % (i0,i1,i1))
+  print('      r%d39 = r%d>>>39' % (i0,i0))
+  print('  r%dandr%d &= r%d' % (i0,i1,i0))
+
+  print('      r%dSigma0 ^= r%d39' % (i0,i0))
+  print('      r%d += r%dSigma1' % (i7,i4))
+  print('  maj%d = r%d' % (i7,i2))
+  print('            r%d += %s' % (i6,i1load))
+  print('  maj%d &= maj%d' % (i7,i6))
+
+  print('  r%d += r%d' % (i3,i7))
+  print('      r%d += r%dSigma0' % (i7,i0))
+  print('            ch%d = r%d' % (i6,i5))
+  print('  maj%d ^= r%dandr%d' % (i7,i0,i1))
+  print('            ch%d ^= r%d' % (i6,i4))
+
+  print('          r%dSigma1 = r%d>>>14' % (i3,i3))
+  print('  r%d += maj%d' % (i7,i7))
+  print('            ch%d &= r%d' % (i6,i3))
+  print('              r%d18 = r%d>>>18' % (i3,i3))
+
+  print('              r%dSigma1 ^= r%d18' % (i3,i3))
+  print('          maj%d &= r%d' % (i6,i7))
+  print('            ch%d ^= r%d' % (i6,i5))
+  print('              r%d41 = r%d>>>41' % (i3,i3))
+
+  print('              r%dSigma1 ^= r%d41' % (i3,i3))
+  print('              r%dSigma0 = r%d>>>28' % (i7,i7))
+  print('          maj%d ^= r%dandr%d' % (i6,i0,i1))
+  print('            r%d += ch%d' % (i6,i6))
+
+  print('              r%d += r%dSigma1' % (i6,i3))
+  print('              r%d34 = r%d>>>34' % (i7,i7))
+
+  print('              r%dSigma0 ^= r%d34' % (i7,i7))
+  print('          r%d += r%d' % (i2,i6))
+  print('          r%d += maj%d' % (i6,i6))
+  print('              r%d39 = r%d>>>39' % (i7,i7))
+
+  print('              r%dSigma0 ^= r%d39' % (i7,i7))
+
+  print('              r%d += r%dSigma0' % (i6,i7))
@@ -0,0 +1,49 @@
+    r7 += wc0123[0]
+      r4Sigma1 = r4>>>14
+    ch7 = r6
+      r418 = r4>>>18
+    ch7 ^= r5
+      r441 = r4>>>41
+      r4Sigma1 ^= r418
+    ch7 &= r4
+      r0Sigma0 = r0>>>28
+      r4Sigma1 ^= r441
+    ch7 ^= r6
+      r034 = r0>>>34
+  maj6 = r1
+  maj6 ^= r0
+      r0Sigma0 ^= r034
+    r7 += ch7
+  r0andr1 = r1
+      r039 = r0>>>39
+  r0andr1 &= r0
+      r0Sigma0 ^= r039
+      r7 += r4Sigma1
+  maj7 = r2
+            r6 += wc0123[1]
+  maj7 &= maj6
+  r3 += r7
+      r7 += r0Sigma0
+            ch6 = r5
+  maj7 ^= r0andr1
+            ch6 ^= r4
+          r3Sigma1 = r3>>>14
+  r7 += maj7
+            ch6 &= r3
+              r318 = r3>>>18
+              r3Sigma1 ^= r318
+          maj6 &= r7
+            ch6 ^= r5
+              r341 = r3>>>41
+              r3Sigma1 ^= r341
+              r7Sigma0 = r7>>>28
+          maj6 ^= r0andr1
+            r6 += ch6
+              r6 += r3Sigma1
+              r734 = r7>>>34
+              r7Sigma0 ^= r734
+          r2 += r6
+          r6 += maj6
+              r739 = r7>>>39
+              r7Sigma0 ^= r739
+              r6 += r7Sigma0
@@ -0,0 +1,49 @@
+    r5 += wc891011[2]
+      r2Sigma1 = r2>>>14
+    ch5 = r4
+      r218 = r2>>>18
+    ch5 ^= r3
+      r241 = r2>>>41
+      r2Sigma1 ^= r218
+    ch5 &= r2
+      r6Sigma0 = r6>>>28
+      r2Sigma1 ^= r241
+    ch5 ^= r4
+      r634 = r6>>>34
+  maj4 = r7
+  maj4 ^= r6
+      r6Sigma0 ^= r634
+    r5 += ch5
+  r6andr7 = r7
+      r639 = r6>>>39
+  r6andr7 &= r6
+      r6Sigma0 ^= r639
+      r5 += r2Sigma1
+  maj5 = r0
+            r4 += wc891011[3]
+  maj5 &= maj4
+  r1 += r5
+      r5 += r6Sigma0
+            ch4 = r3
+  maj5 ^= r6andr7
+            ch4 ^= r2
+          r1Sigma1 = r1>>>14
+  r5 += maj5
+            ch4 &= r1
+              r118 = r1>>>18
+              r1Sigma1 ^= r118
+          maj4 &= r5
+            ch4 ^= r3
+              r141 = r1>>>41
+              r1Sigma1 ^= r141
+              r5Sigma0 = r5>>>28
+          maj4 ^= r6andr7
+            r4 += ch4
+              r4 += r1Sigma1
+              r534 = r5>>>34
+              r5Sigma0 ^= r534
+          r0 += r4
+          r4 += maj4
+              r539 = r5>>>39
+              r5Sigma0 ^= r539
+              r4 += r5Sigma0
@@ -0,0 +1,49 @@
+    r3 += wc12131415[0]
+      r0Sigma1 = r0>>>14
+    ch3 = r2
+      r018 = r0>>>18
+    ch3 ^= r1
+      r041 = r0>>>41
+      r0Sigma1 ^= r018
+    ch3 &= r0
+      r4Sigma0 = r4>>>28
+      r0Sigma1 ^= r041
+    ch3 ^= r2
+      r434 = r4>>>34
+  maj2 = r5
+  maj2 ^= r4
+      r4Sigma0 ^= r434
+    r3 += ch3
+  r4andr5 = r5
+      r439 = r4>>>39
+  r4andr5 &= r4
+      r4Sigma0 ^= r439
+      r3 += r0Sigma1
+  maj3 = r6
+            r2 += wc12131415[1]
+  maj3 &= maj2
+  r7 += r3
+      r3 += r4Sigma0
+            ch2 = r1
+  maj3 ^= r4andr5
+            ch2 ^= r0
+          r7Sigma1 = r7>>>14
+  r3 += maj3
+            ch2 &= r7
+              r718 = r7>>>18
+              r7Sigma1 ^= r718
+          maj2 &= r3
+            ch2 ^= r1
+              r741 = r7>>>41
+              r7Sigma1 ^= r741
+              r3Sigma0 = r3>>>28
+          maj2 ^= r4andr5
+            r2 += ch2
+              r2 += r7Sigma1
+              r334 = r3>>>34
+              r3Sigma0 ^= r334
+          r6 += r2
+          r2 += maj2
+              r339 = r3>>>39
+              r3Sigma0 ^= r339
+              r2 += r3Sigma0
@@ -0,0 +1,49 @@
+    r1 += wc12131415[2]
+      r6Sigma1 = r6>>>14
+    ch1 = r0
+      r618 = r6>>>18
+    ch1 ^= r7
+      r641 = r6>>>41
+      r6Sigma1 ^= r618
+    ch1 &= r6
+      r2Sigma0 = r2>>>28
+      r6Sigma1 ^= r641
+    ch1 ^= r0
+      r234 = r2>>>34
+  maj0 = r3
+  maj0 ^= r2
+      r2Sigma0 ^= r234
+    r1 += ch1
+  r2andr3 = r3
+      r239 = r2>>>39
+  r2andr3 &= r2
+      r2Sigma0 ^= r239
+      r1 += r6Sigma1
+  maj1 = r4
+            r0 += wc12131415[3]
+  maj1 &= maj0
+  r5 += r1
+      r1 += r2Sigma0
+            ch0 = r7
+  maj1 ^= r2andr3
+            ch0 ^= r6
+          r5Sigma1 = r5>>>14
+  r1 += maj1
+            ch0 &= r5
+              r518 = r5>>>18
+              r5Sigma1 ^= r518
+          maj0 &= r1
+            ch0 ^= r7
+              r541 = r5>>>41
+              r5Sigma1 ^= r541
+              r1Sigma0 = r1>>>28
+          maj0 ^= r2andr3
+            r0 += ch0
+              r0 += r5Sigma1
+              r134 = r1>>>34
+              r1Sigma0 ^= r134
+          r4 += r0
+          r0 += maj0
+              r139 = r1>>>39
+              r1Sigma0 ^= r139
+              r0 += r1Sigma0
@@ -0,0 +1,49 @@
+    r5 += wc0123[2]
+      r2Sigma1 = r2>>>14
+    ch5 = r4
+      r218 = r2>>>18
+    ch5 ^= r3
+      r241 = r2>>>41
+      r2Sigma1 ^= r218
+    ch5 &= r2
+      r6Sigma0 = r6>>>28
+      r2Sigma1 ^= r241
+    ch5 ^= r4
+      r634 = r6>>>34
+  maj4 = r7
+  maj4 ^= r6
+      r6Sigma0 ^= r634
+    r5 += ch5
+  r6andr7 = r7
+      r639 = r6>>>39
+  r6andr7 &= r6
+      r6Sigma0 ^= r639
+      r5 += r2Sigma1
+  maj5 = r0
+            r4 += wc0123[3]
+  maj5 &= maj4
+  r1 += r5
+      r5 += r6Sigma0
+            ch4 = r3
+  maj5 ^= r6andr7
+            ch4 ^= r2
+          r1Sigma1 = r1>>>14
+  r5 += maj5
+            ch4 &= r1
+              r118 = r1>>>18
+              r1Sigma1 ^= r118
+          maj4 &= r5
+            ch4 ^= r3
+              r141 = r1>>>41
+              r1Sigma1 ^= r141
+              r5Sigma0 = r5>>>28
+          maj4 ^= r6andr7
+            r4 += ch4
+              r4 += r1Sigma1
+              r534 = r5>>>34
+              r5Sigma0 ^= r534
+          r0 += r4
+          r4 += maj4
+              r539 = r5>>>39
+              r5Sigma0 ^= r539
+              r4 += r5Sigma0
@@ -0,0 +1,49 @@
+    r3 += wc4567[0]
+      r0Sigma1 = r0>>>14
+    ch3 = r2
+      r018 = r0>>>18
+    ch3 ^= r1
+      r041 = r0>>>41
+      r0Sigma1 ^= r018
+    ch3 &= r0
+      r4Sigma0 = r4>>>28
+      r0Sigma1 ^= r041
+    ch3 ^= r2
+      r434 = r4>>>34
+  maj2 = r5
+  maj2 ^= r4
+      r4Sigma0 ^= r434
+    r3 += ch3
+  r4andr5 = r5
+      r439 = r4>>>39
+  r4andr5 &= r4
+      r4Sigma0 ^= r439
+      r3 += r0Sigma1
+  maj3 = r6
+            r2 += wc4567[1]
+  maj3 &= maj2
+  r7 += r3
+      r3 += r4Sigma0
+            ch2 = r1
+  maj3 ^= r4andr5
+            ch2 ^= r0
+          r7Sigma1 = r7>>>14
+  r3 += maj3
+            ch2 &= r7
+              r718 = r7>>>18
+              r7Sigma1 ^= r718
+          maj2 &= r3
+            ch2 ^= r1
+              r741 = r7>>>41
+              r7Sigma1 ^= r741
+              r3Sigma0 = r3>>>28
+          maj2 ^= r4andr5
+            r2 += ch2
+              r2 += r7Sigma1
+              r334 = r3>>>34
+              r3Sigma0 ^= r334
+          r6 += r2
+          r2 += maj2
+              r339 = r3>>>39
+              r3Sigma0 ^= r339
+              r2 += r3Sigma0
@@ -0,0 +1,49 @@
+    r1 += wc4567[2]
+      r6Sigma1 = r6>>>14
+    ch1 = r0
+      r618 = r6>>>18
+    ch1 ^= r7
+      r641 = r6>>>41
+      r6Sigma1 ^= r618
+    ch1 &= r6
+      r2Sigma0 = r2>>>28
+      r6Sigma1 ^= r641
+    ch1 ^= r0
+      r234 = r2>>>34
+  maj0 = r3
+  maj0 ^= r2
+      r2Sigma0 ^= r234
+    r1 += ch1
+  r2andr3 = r3
+      r239 = r2>>>39
+  r2andr3 &= r2
+      r2Sigma0 ^= r239
+      r1 += r6Sigma1
+  maj1 = r4
+            r0 += wc4567[3]
+  maj1 &= maj0
+  r5 += r1
+      r1 += r2Sigma0
+            ch0 = r7
+  maj1 ^= r2andr3
+            ch0 ^= r6
+          r5Sigma1 = r5>>>14
+  r1 += maj1
+            ch0 &= r5
+              r518 = r5>>>18
+              r5Sigma1 ^= r518
+          maj0 &= r1
+            ch0 ^= r7
+              r541 = r5>>>41
+              r5Sigma1 ^= r541
+              r1Sigma0 = r1>>>28
+          maj0 ^= r2andr3
+            r0 += ch0
+              r0 += r5Sigma1
+              r134 = r1>>>34
+              r1Sigma0 ^= r134
+          r4 += r0
+          r0 += maj0
+              r139 = r1>>>39
+              r1Sigma0 ^= r139
+              r0 += r1Sigma0
@@ -0,0 +1,49 @@
+    r7 += wc891011[0]
+      r4Sigma1 = r4>>>14
+    ch7 = r6
+      r418 = r4>>>18
+    ch7 ^= r5
+      r441 = r4>>>41
+      r4Sigma1 ^= r418
+    ch7 &= r4
+      r0Sigma0 = r0>>>28
+      r4Sigma1 ^= r441
+    ch7 ^= r6
+      r034 = r0>>>34
+  maj6 = r1
+  maj6 ^= r0
+      r0Sigma0 ^= r034
+    r7 += ch7
+  r0andr1 = r1
+      r039 = r0>>>39
+  r0andr1 &= r0
+      r0Sigma0 ^= r039
+      r7 += r4Sigma1
+  maj7 = r2
+            r6 += wc891011[1]
+  maj7 &= maj6
+  r3 += r7
+      r7 += r0Sigma0
+            ch6 = r5
+  maj7 ^= r0andr1
+            ch6 ^= r4
+          r3Sigma1 = r3>>>14
+  r7 += maj7
+            ch6 &= r3
+              r318 = r3>>>18
+              r3Sigma1 ^= r318
+          maj6 &= r7
+            ch6 ^= r5
+              r341 = r3>>>41
+              r3Sigma1 ^= r341
+              r7Sigma0 = r7>>>28
+          maj6 ^= r0andr1
+            r6 += ch6
+              r6 += r3Sigma1
+              r734 = r7>>>34
+              r7Sigma0 ^= r734
+          r2 += r6
+          r6 += maj6
+              r739 = r7>>>39
+              r7Sigma0 ^= r739
+              r6 += r7Sigma0
@@ -0,0 +1 @@
+../m3/api.h
@@ -0,0 +1,218 @@
+#include "crypto_hashblocks.h"
+
+typedef unsigned long long uint64;
+
+static uint64 load_bigendian(const unsigned char *x)
+{
+  return
+      (uint64) (x[7]) \
+  | (((uint64) (x[6])) << 8) \
+  | (((uint64) (x[5])) << 16) \
+  | (((uint64) (x[4])) << 24) \
+  | (((uint64) (x[3])) << 32) \
+  | (((uint64) (x[2])) << 40) \
+  | (((uint64) (x[1])) << 48) \
+  | (((uint64) (x[0])) << 56)
+  ;
+}
+
+static void store_bigendian(unsigned char *x,uint64 u)
+{
+  x[7] = u; u >>= 8;
+  x[6] = u; u >>= 8;
+  x[5] = u; u >>= 8;
+  x[4] = u; u >>= 8;
+  x[3] = u; u >>= 8;
+  x[2] = u; u >>= 8;
+  x[1] = u; u >>= 8;
+  x[0] = u;
+}
+
+#define SHR(x,c) ((x) >> (c))
+#define ROTR(x,c) (((x) >> (c)) | ((x) << (64 - (c))))
+
+#define sigma0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x,7))
+#define sigma1(x) (ROTR(x,19) ^ ROTR(x,61) ^ SHR(x,6))
+
+#define M(w0,w14,w9,w1) w0 = sigma1(w14) + w9 + sigma0(w1) + w0;
+
+static void expand(uint64 *w)
+{
+  M(w[0] ,w[14],w[9] ,w[1] )
+  M(w[1] ,w[15],w[10],w[2] )
+  M(w[2] ,w[0] ,w[11],w[3] )
+  M(w[3] ,w[1] ,w[12],w[4] )
+  M(w[4] ,w[2] ,w[13],w[5] )
+  M(w[5] ,w[3] ,w[14],w[6] )
+  M(w[6] ,w[4] ,w[15],w[7] )
+  M(w[7] ,w[5] ,w[0] ,w[8] )
+  M(w[8] ,w[6] ,w[1] ,w[9] )
+  M(w[9] ,w[7] ,w[2] ,w[10])
+  M(w[10],w[8] ,w[3] ,w[11])
+  M(w[11],w[9] ,w[4] ,w[12])
+  M(w[12],w[10],w[5] ,w[13])
+  M(w[13],w[11],w[6] ,w[14])
+  M(w[14],w[12],w[7] ,w[15])
+  M(w[15],w[13],w[8] ,w[0] )
+}
+
+#define Ch(x,y,z) (z ^ (x & (y ^ z)))
+#define Maj(x,y,z) ((x & (y ^ z)) ^ (y & z))
+#define Sigma0(x) (ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39))
+#define Sigma1(x) (ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41))
+
+#define F(r0,r1,r2,r3,r4,r5,r6,r7,w,k) \
+  r7 += Sigma1(r4) + Ch(r4,r5,r6) + k + w; \
+  r3 += r7; \
+  r7 += Sigma0(r0) + Maj(r0,r1,r2);
+
+static void handle(uint64 *r,const uint64 *w,const uint64 *c)
+{
+  F(r[0],r[1],r[2],r[3],r[4],r[5],r[6],r[7],w[0] ,c[0])
+  F(r[7],r[0],r[1],r[2],r[3],r[4],r[5],r[6],w[1] ,c[1])
+  F(r[6],r[7],r[0],r[1],r[2],r[3],r[4],r[5],w[2] ,c[2])
+  F(r[5],r[6],r[7],r[0],r[1],r[2],r[3],r[4],w[3] ,c[3])
+  F(r[4],r[5],r[6],r[7],r[0],r[1],r[2],r[3],w[4] ,c[4])
+  F(r[3],r[4],r[5],r[6],r[7],r[0],r[1],r[2],w[5] ,c[5])
+  F(r[2],r[3],r[4],r[5],r[6],r[7],r[0],r[1],w[6] ,c[6])
+  F(r[1],r[2],r[3],r[4],r[5],r[6],r[7],r[0],w[7] ,c[7])
+  F(r[0],r[1],r[2],r[3],r[4],r[5],r[6],r[7],w[8] ,c[8])
+  F(r[7],r[0],r[1],r[2],r[3],r[4],r[5],r[6],w[9] ,c[9])
+  F(r[6],r[7],r[0],r[1],r[2],r[3],r[4],r[5],w[10],c[10])
+  F(r[5],r[6],r[7],r[0],r[1],r[2],r[3],r[4],w[11],c[11])
+  F(r[4],r[5],r[6],r[7],r[0],r[1],r[2],r[3],w[12],c[12])
+  F(r[3],r[4],r[5],r[6],r[7],r[0],r[1],r[2],w[13],c[13])
+  F(r[2],r[3],r[4],r[5],r[6],r[7],r[0],r[1],w[14],c[14])
+  F(r[1],r[2],r[3],r[4],r[5],r[6],r[7],r[0],w[15],c[15])
+}
+
+static const uint64 round[80] = {
+  0x428a2f98d728ae22ULL
+, 0x7137449123ef65cdULL
+, 0xb5c0fbcfec4d3b2fULL
+, 0xe9b5dba58189dbbcULL
+, 0x3956c25bf348b538ULL
+, 0x59f111f1b605d019ULL
+, 0x923f82a4af194f9bULL
+, 0xab1c5ed5da6d8118ULL
+, 0xd807aa98a3030242ULL
+, 0x12835b0145706fbeULL
+, 0x243185be4ee4b28cULL
+, 0x550c7dc3d5ffb4e2ULL
+, 0x72be5d74f27b896fULL
+, 0x80deb1fe3b1696b1ULL
+, 0x9bdc06a725c71235ULL
+, 0xc19bf174cf692694ULL
+, 0xe49b69c19ef14ad2ULL
+, 0xefbe4786384f25e3ULL
+, 0x0fc19dc68b8cd5b5ULL
+, 0x240ca1cc77ac9c65ULL
+, 0x2de92c6f592b0275ULL
+, 0x4a7484aa6ea6e483ULL
+, 0x5cb0a9dcbd41fbd4ULL
+, 0x76f988da831153b5ULL
+, 0x983e5152ee66dfabULL
+, 0xa831c66d2db43210ULL
+, 0xb00327c898fb213fULL
+, 0xbf597fc7beef0ee4ULL
+, 0xc6e00bf33da88fc2ULL
+, 0xd5a79147930aa725ULL
+, 0x06ca6351e003826fULL
+, 0x142929670a0e6e70ULL
+, 0x27b70a8546d22ffcULL
+, 0x2e1b21385c26c926ULL
+, 0x4d2c6dfc5ac42aedULL
+, 0x53380d139d95b3dfULL
+, 0x650a73548baf63deULL
+, 0x766a0abb3c77b2a8ULL
+, 0x81c2c92e47edaee6ULL
+, 0x92722c851482353bULL
+, 0xa2bfe8a14cf10364ULL
+, 0xa81a664bbc423001ULL
+, 0xc24b8b70d0f89791ULL
+, 0xc76c51a30654be30ULL
+, 0xd192e819d6ef5218ULL
+, 0xd69906245565a910ULL
+, 0xf40e35855771202aULL
+, 0x106aa07032bbd1b8ULL
+, 0x19a4c116b8d2d0c8ULL
+, 0x1e376c085141ab53ULL
+, 0x2748774cdf8eeb99ULL
+, 0x34b0bcb5e19b48a8ULL
+, 0x391c0cb3c5c95a63ULL
+, 0x4ed8aa4ae3418acbULL
+, 0x5b9cca4f7763e373ULL
+, 0x682e6ff3d6b2b8a3ULL
+, 0x748f82ee5defb2fcULL
+, 0x78a5636f43172f60ULL
+, 0x84c87814a1f0ab72ULL
+, 0x8cc702081a6439ecULL
+, 0x90befffa23631e28ULL
+, 0xa4506cebde82bde9ULL
+, 0xbef9a3f7b2c67915ULL
+, 0xc67178f2e372532bULL
+, 0xca273eceea26619cULL
+, 0xd186b8c721c0c207ULL
+, 0xeada7dd6cde0eb1eULL
+, 0xf57d4f7fee6ed178ULL
+, 0x06f067aa72176fbaULL
+, 0x0a637dc5a2c898a6ULL
+, 0x113f9804bef90daeULL
+, 0x1b710b35131c471bULL
+, 0x28db77f523047d84ULL
+, 0x32caab7b40c72493ULL
+, 0x3c9ebe0a15c9bebcULL
+, 0x431d67c49c100d4cULL
+, 0x4cc5d4becb3e42b6ULL
+, 0x597f299cfc657e2aULL
+, 0x5fcb6fab3ad6faecULL
+, 0x6c44198c4a475817ULL
+};
+
+int crypto_hashblocks(unsigned char *statebytes,const unsigned char *in,long long inlen)
+{
+  uint64 w[16];
+  uint64 state[8];
+  uint64 r[8];
+  int i;
+
+  for (i = 0;i < 8;++i)
+    state[i] = r[i] = load_bigendian(statebytes+8*i);
+
+  while (inlen >= 128) {
+    for (i = 0;i < 16;++i)
+      w[i] = load_bigendian(in+8*i);
+
+    handle(r,w,round+0);
+
+    expand(w);
+
+    handle(r,w,round+16);
+
+    expand(w);
+
+    handle(r,w,round+32);
+
+    expand(w);
+
+    handle(r,w,round+48);
+
+    expand(w);
+
+    handle(r,w,round+64);
+
+    for (i = 0;i < 8;++i) {
+      uint64 x = r[i]+state[i];
+      state[i] = x;
+      r[i] = x;
+    }
+      
+    in += 128;
+    inlen -= 128;
+  }
+
+  for (i = 0;i < 8;++i)
+    store_bigendian(statebytes+8*i,state[i]);
+
+  return inlen;
+}
@@ -0,0 +1 @@
+../m3/implementors
@@ -0,0 +1 @@
+../m3/api.h
@@ -0,0 +1,198 @@
+#include "crypto_hashblocks.h"
+
+typedef unsigned long long uint64;
+
+static uint64 load_bigendian(const unsigned char *x)
+{
+  return
+      (uint64) (x[7]) \
+  | (((uint64) (x[6])) << 8) \
+  | (((uint64) (x[5])) << 16) \
+  | (((uint64) (x[4])) << 24) \
+  | (((uint64) (x[3])) << 32) \
+  | (((uint64) (x[2])) << 40) \
+  | (((uint64) (x[1])) << 48) \
+  | (((uint64) (x[0])) << 56)
+  ;
+}
+
+static void store_bigendian(unsigned char *x,uint64 u)
+{
+  x[7] = u; u >>= 8;
+  x[6] = u; u >>= 8;
+  x[5] = u; u >>= 8;
+  x[4] = u; u >>= 8;
+  x[3] = u; u >>= 8;
+  x[2] = u; u >>= 8;
+  x[1] = u; u >>= 8;
+  x[0] = u;
+}
+
+#define SHR(x,c) ((x) >> (c))
+#define ROTR(x,c) (((x) >> (c)) | ((x) << (64 - (c))))
+#define sigma0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x,7))
+#define sigma1(x) (ROTR(x,19) ^ ROTR(x,61) ^ SHR(x,6))
+
+static void expand(uint64 *w)
+{
+  int i;
+  for (i = 0;i < 16;++i) {
+    w[i] += w[15&(i+9)];
+    w[i] += sigma1(w[15&(i+14)]);
+    w[i] += sigma0(w[15&(i+1)]);
+  }
+}
+
+#define Ch(x,y,z) (z ^ (x & (y ^ z)))
+#define Maj(x,y,z) ((x & (y ^ z)) ^ (y & z))
+#define Sigma0(x) (ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39))
+#define Sigma1(x) (ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41))
+
+static void handle(uint64 *r,const uint64 *w,const uint64 *c)
+{
+  int i;
+  uint64 x;
+
+  for (i = 0;i < 16;++i) {
+    x = r[7&(7-i)];
+    x += w[i];
+    x += c[i];
+    x += Sigma1(r[7&(4-i)]);
+    x += Ch(r[7&(4-i)],r[7&(5-i)],r[7&(6-i)]);
+    r[7&(3-i)] += x;
+    x += Sigma0(r[7&(0-i)]);
+    x += Maj(r[7&(0-i)],r[7&(1-i)],r[7&(2-i)]);
+    r[7&(7-i)] = x;
+  }
+}
+
+static const uint64 round[80] = {
+  0x428a2f98d728ae22ULL
+, 0x7137449123ef65cdULL
+, 0xb5c0fbcfec4d3b2fULL
+, 0xe9b5dba58189dbbcULL
+, 0x3956c25bf348b538ULL
+, 0x59f111f1b605d019ULL
+, 0x923f82a4af194f9bULL
+, 0xab1c5ed5da6d8118ULL
+, 0xd807aa98a3030242ULL
+, 0x12835b0145706fbeULL
+, 0x243185be4ee4b28cULL
+, 0x550c7dc3d5ffb4e2ULL
+, 0x72be5d74f27b896fULL
+, 0x80deb1fe3b1696b1ULL
+, 0x9bdc06a725c71235ULL
+, 0xc19bf174cf692694ULL
+, 0xe49b69c19ef14ad2ULL
+, 0xefbe4786384f25e3ULL
+, 0x0fc19dc68b8cd5b5ULL
+, 0x240ca1cc77ac9c65ULL
+, 0x2de92c6f592b0275ULL
+, 0x4a7484aa6ea6e483ULL
+, 0x5cb0a9dcbd41fbd4ULL
+, 0x76f988da831153b5ULL
+, 0x983e5152ee66dfabULL
+, 0xa831c66d2db43210ULL
+, 0xb00327c898fb213fULL
+, 0xbf597fc7beef0ee4ULL
+, 0xc6e00bf33da88fc2ULL
+, 0xd5a79147930aa725ULL
+, 0x06ca6351e003826fULL
+, 0x142929670a0e6e70ULL
+, 0x27b70a8546d22ffcULL
+, 0x2e1b21385c26c926ULL
+, 0x4d2c6dfc5ac42aedULL
+, 0x53380d139d95b3dfULL
+, 0x650a73548baf63deULL
+, 0x766a0abb3c77b2a8ULL
+, 0x81c2c92e47edaee6ULL
+, 0x92722c851482353bULL
+, 0xa2bfe8a14cf10364ULL
+, 0xa81a664bbc423001ULL
+, 0xc24b8b70d0f89791ULL
+, 0xc76c51a30654be30ULL
+, 0xd192e819d6ef5218ULL
+, 0xd69906245565a910ULL
+, 0xf40e35855771202aULL
+, 0x106aa07032bbd1b8ULL
+, 0x19a4c116b8d2d0c8ULL
+, 0x1e376c085141ab53ULL
+, 0x2748774cdf8eeb99ULL
+, 0x34b0bcb5e19b48a8ULL
+, 0x391c0cb3c5c95a63ULL
+, 0x4ed8aa4ae3418acbULL
+, 0x5b9cca4f7763e373ULL
+, 0x682e6ff3d6b2b8a3ULL
+, 0x748f82ee5defb2fcULL
+, 0x78a5636f43172f60ULL
+, 0x84c87814a1f0ab72ULL
+, 0x8cc702081a6439ecULL
+, 0x90befffa23631e28ULL
+, 0xa4506cebde82bde9ULL
+, 0xbef9a3f7b2c67915ULL
+, 0xc67178f2e372532bULL
+, 0xca273eceea26619cULL
+, 0xd186b8c721c0c207ULL
+, 0xeada7dd6cde0eb1eULL
+, 0xf57d4f7fee6ed178ULL
+, 0x06f067aa72176fbaULL
+, 0x0a637dc5a2c898a6ULL
+, 0x113f9804bef90daeULL
+, 0x1b710b35131c471bULL
+, 0x28db77f523047d84ULL
+, 0x32caab7b40c72493ULL
+, 0x3c9ebe0a15c9bebcULL
+, 0x431d67c49c100d4cULL
+, 0x4cc5d4becb3e42b6ULL
+, 0x597f299cfc657e2aULL
+, 0x5fcb6fab3ad6faecULL
+, 0x6c44198c4a475817ULL
+};
+
+int crypto_hashblocks(unsigned char *statebytes,const unsigned char *in,long long inlen)
+{
+  uint64 w[16];
+  uint64 state[8];
+  uint64 r[8];
+  int i;
+
+  for (i = 0;i < 8;++i)
+    state[i] = r[i] = load_bigendian(statebytes+8*i);
+
+  while (inlen >= 128) {
+    for (i = 0;i < 16;++i)
+      w[i] = load_bigendian(in+8*i);
+
+    handle(r,w,round+0);
+
+    expand(w);
+
+    handle(r,w,round+16);
+
+    expand(w);
+
+    handle(r,w,round+32);
+
+    expand(w);
+
+    handle(r,w,round+48);
+
+    expand(w);
+
+    handle(r,w,round+64);
+
+    for (i = 0;i < 8;++i) {
+      uint64 x = r[i]+state[i];
+      state[i] = x;
+      r[i] = x;
+    }
+      
+    in += 128;
+    inlen -= 128;
+  }
+
+  for (i = 0;i < 8;++i)
+    store_bigendian(statebytes+8*i,state[i]);
+
+  return inlen;
+}
@@ -0,0 +1 @@
+../m3/implementors
@@ -0,0 +1 @@
+../m3/api.h
@@ -0,0 +1,170 @@
+#include "crypto_hashblocks.h"
+
+typedef unsigned long long uint64;
+
+static uint64 load_bigendian(const unsigned char *x)
+{
+  return
+      (uint64) (x[7]) \
+  | (((uint64) (x[6])) << 8) \
+  | (((uint64) (x[5])) << 16) \
+  | (((uint64) (x[4])) << 24) \
+  | (((uint64) (x[3])) << 32) \
+  | (((uint64) (x[2])) << 40) \
+  | (((uint64) (x[1])) << 48) \
+  | (((uint64) (x[0])) << 56)
+  ;
+}
+
+static void store_bigendian(unsigned char *x,uint64 u)
+{
+  x[7] = u; u >>= 8;
+  x[6] = u; u >>= 8;
+  x[5] = u; u >>= 8;
+  x[4] = u; u >>= 8;
+  x[3] = u; u >>= 8;
+  x[2] = u; u >>= 8;
+  x[1] = u; u >>= 8;
+  x[0] = u;
+}
+
+#define SHR(x,c) ((x) >> (c))
+#define ROTR(x,c) (((x) >> (c)) | ((x) << (64 - (c))))
+#define sigma0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x,7))
+#define sigma1(x) (ROTR(x,19) ^ ROTR(x,61) ^ SHR(x,6))
+
+#define Ch(x,y,z) (z ^ (x & (y ^ z)))
+#define Maj(x,y,z) ((x & (y ^ z)) ^ (y & z))
+#define Sigma0(x) (ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39))
+#define Sigma1(x) (ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41))
+
+static const uint64 round[80] = {
+  0x428a2f98d728ae22ULL
+, 0x7137449123ef65cdULL
+, 0xb5c0fbcfec4d3b2fULL
+, 0xe9b5dba58189dbbcULL
+, 0x3956c25bf348b538ULL
+, 0x59f111f1b605d019ULL
+, 0x923f82a4af194f9bULL
+, 0xab1c5ed5da6d8118ULL
+, 0xd807aa98a3030242ULL
+, 0x12835b0145706fbeULL
+, 0x243185be4ee4b28cULL
+, 0x550c7dc3d5ffb4e2ULL
+, 0x72be5d74f27b896fULL
+, 0x80deb1fe3b1696b1ULL
+, 0x9bdc06a725c71235ULL
+, 0xc19bf174cf692694ULL
+, 0xe49b69c19ef14ad2ULL
+, 0xefbe4786384f25e3ULL
+, 0x0fc19dc68b8cd5b5ULL
+, 0x240ca1cc77ac9c65ULL
+, 0x2de92c6f592b0275ULL
+, 0x4a7484aa6ea6e483ULL
+, 0x5cb0a9dcbd41fbd4ULL
+, 0x76f988da831153b5ULL
+, 0x983e5152ee66dfabULL
+, 0xa831c66d2db43210ULL
+, 0xb00327c898fb213fULL
+, 0xbf597fc7beef0ee4ULL
+, 0xc6e00bf33da88fc2ULL
+, 0xd5a79147930aa725ULL
+, 0x06ca6351e003826fULL
+, 0x142929670a0e6e70ULL
+, 0x27b70a8546d22ffcULL
+, 0x2e1b21385c26c926ULL
+, 0x4d2c6dfc5ac42aedULL
+, 0x53380d139d95b3dfULL
+, 0x650a73548baf63deULL
+, 0x766a0abb3c77b2a8ULL
+, 0x81c2c92e47edaee6ULL
+, 0x92722c851482353bULL
+, 0xa2bfe8a14cf10364ULL
+, 0xa81a664bbc423001ULL
+, 0xc24b8b70d0f89791ULL
+, 0xc76c51a30654be30ULL
+, 0xd192e819d6ef5218ULL
+, 0xd69906245565a910ULL
+, 0xf40e35855771202aULL
+, 0x106aa07032bbd1b8ULL
+, 0x19a4c116b8d2d0c8ULL
+, 0x1e376c085141ab53ULL
+, 0x2748774cdf8eeb99ULL
+, 0x34b0bcb5e19b48a8ULL
+, 0x391c0cb3c5c95a63ULL
+, 0x4ed8aa4ae3418acbULL
+, 0x5b9cca4f7763e373ULL
+, 0x682e6ff3d6b2b8a3ULL
+, 0x748f82ee5defb2fcULL
+, 0x78a5636f43172f60ULL
+, 0x84c87814a1f0ab72ULL
+, 0x8cc702081a6439ecULL
+, 0x90befffa23631e28ULL
+, 0xa4506cebde82bde9ULL
+, 0xbef9a3f7b2c67915ULL
+, 0xc67178f2e372532bULL
+, 0xca273eceea26619cULL
+, 0xd186b8c721c0c207ULL
+, 0xeada7dd6cde0eb1eULL
+, 0xf57d4f7fee6ed178ULL
+, 0x06f067aa72176fbaULL
+, 0x0a637dc5a2c898a6ULL
+, 0x113f9804bef90daeULL
+, 0x1b710b35131c471bULL
+, 0x28db77f523047d84ULL
+, 0x32caab7b40c72493ULL
+, 0x3c9ebe0a15c9bebcULL
+, 0x431d67c49c100d4cULL
+, 0x4cc5d4becb3e42b6ULL
+, 0x597f299cfc657e2aULL
+, 0x5fcb6fab3ad6faecULL
+, 0x6c44198c4a475817ULL
+};
+
+int crypto_hashblocks(unsigned char *statebytes,const unsigned char *in,long long inlen)
+{
+  uint64 w[16];
+  uint64 state[8];
+  uint64 r[8];
+  uint64 x;
+  int i;
+
+  for (i = 0;i < 8;++i)
+    state[i] = r[i] = load_bigendian(statebytes+8*i);
+
+  while (inlen >= 128) {
+    for (i = 0;i < 16;++i)
+      w[i] = load_bigendian(in+8*i);
+
+    for (i = 0;i < 80;++i) {
+      x = r[7&(7-i)];
+      x += w[15&i];
+      x += round[i];
+      x += Sigma1(r[7&(4-i)]);
+      x += Ch(r[7&(4-i)],r[7&(5-i)],r[7&(6-i)]);
+      r[7&(3-i)] += x;
+      x += Sigma0(r[7&(0-i)]);
+      x += Maj(r[7&(0-i)],r[7&(1-i)],r[7&(2-i)]);
+      r[7&(7-i)] = x;
+
+      /* not used for i >= 64: */
+      w[15&i] += w[15&(i+9)];
+      w[15&i] += sigma1(w[15&(i+14)]);
+      w[15&i] += sigma0(w[15&(i+1)]);
+    }
+
+    for (i = 0;i < 8;++i) {
+      uint64 x = r[i]+state[i];
+      state[i] = x;
+      r[i] = x;
+    }
+      
+    in += 128;
+    inlen -= 128;
+  }
+
+  for (i = 0;i < 8;++i)
+    store_bigendian(statebytes+8*i,state[i]);
+
+  return inlen;
+}
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`../../../crypto_pow/inv25519/sandy2x/architectures`
				`@@ -0,0 +1 @@`
				`Daniel J. Bernstein (wrapper around crypto_hashblocks/sha512)`