From 849baf41d8cba5d9e023c32dfd4c4f98cceda095 Mon Sep 17 00:00:00 2001
From: David Palm <dvdplm@gmail.com>
Date: Thu, 17 May 2018 11:27:50 +0200
Subject: [PATCH 1/2] Don't unroll outer loop

Not unrolling the outer loop seems to speed up hashing quite significantly:

Original (unrolled):
```
running 3 tests
test bench_keccak_256_with_empty_input   ... bench:         557 ns/iter (+/- 46)
test bench_keccak_256_with_large_input   ... bench:      17,288 ns/iter (+/- 1,871) = 236 MB/s
test bench_keccak_256_with_typical_input ... bench:         577 ns/iter (+/- 28) = 88 MB/s
```

This branch (not unrolled):
```
running 3 tests
test bench_keccak_256_with_empty_input   ... bench:         487 ns/iter (+/- 25)
test bench_keccak_256_with_large_input   ... bench:      14,645 ns/iter (+/- 675) = 279 MB/s
test bench_keccak_256_with_typical_input ... bench:         495 ns/iter (+/- 32) = 103 MB/s
```

"Inspired" by https://github.com/RustCrypto/sponges/blob/master/keccak/src/lib.rs#L138

The benchmarks above are run from the `keccak-hash` crate so that the numbers can be compared with those [here](https://github.com/paritytech/keccak-hash/pull/1).
---
 .gitignore |  1 +
 src/lib.rs | 98 ++++++++++++++++++++++++++----------------------------
 2 files changed, 49 insertions(+), 50 deletions(-)

diff --git a/.gitignore b/.gitignore
index d4f917d..9b38af7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 target
 Cargo.lock
 *.swp
+.idea
diff --git a/src/lib.rs b/src/lib.rs
index 877a527..c33c18b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -67,72 +67,70 @@ const RC: [u64; 24] = [
 /// keccak-f[1600]
 pub fn keccakf(a: &mut [u64; PLEN]) {
     let mut arrays: [[u64; 5]; 24] = [[0; 5]; 24];
-
-    unroll! {
-        for i in 0..24 {
-            // Theta
-            unroll! {
-                for x in 0..5 {
-                    // This looks useless but it gets way slower without it. I tried using
-                    // `mem::uninitialized` for the initialisation of `arrays` but that also makes
-                    // it slower, although not by as much as removing this assignment. Optimisers
-                    // are weird. Maybe a different version of LLVM will react differently, so if
-                    // you see this comment in the future try deleting this assignment and using
-                    // uninit above and see how it affects the benchmarks.
-                    arrays[i][x] = 0;
-
-                    unroll! {
-                        for y_count in 0..5 {
-                            let y = y_count * 5;
-                            arrays[i][x] ^= a[x + y];
-                        }
+    // Not unrolling this is faster, see https://github.com/RustCrypto/sponges/blob/master/keccak/src/lib.rs#L138
+    for i in 0..24 {
+        // Theta
+        unroll! {
+            for x in 0..5 {
+                // This looks useless but it gets way slower without it. I tried using
+                // `mem::uninitialized` for the initialisation of `arrays` but that also makes
+                // it slower, although not by as much as removing this assignment. Optimisers
+                // are weird. Maybe a different version of LLVM will react differently, so if
+                // you see this comment in the future try deleting this assignment and using
+                // uninit above and see how it affects the benchmarks.
+                arrays[i][x] = 0;
+
+                unroll! {
+                    for y_count in 0..5 {
+                        let y = y_count * 5;
+                        arrays[i][x] ^= a[x + y];
                     }
                 }
             }
+        }
 
-            unroll! {
-                for x in 0..5 {
-                    unroll! {
-                        for y_count in 0..5 {
-                            let y = y_count * 5;
-                            a[y + x] ^= arrays[i][(x + 4) % 5] ^ arrays[i][(x + 1) % 5].rotate_left(1);
-                        }
+        unroll! {
+            for x in 0..5 {
+                unroll! {
+                    for y_count in 0..5 {
+                        let y = y_count * 5;
+                        a[y + x] ^= arrays[i][(x + 4) % 5] ^ arrays[i][(x + 1) % 5].rotate_left(1);
                     }
                 }
             }
+        }
 
-            // Rho and pi
-            let mut last = a[1];
-            unroll! {
-                for x in 0..24 {
-                    arrays[i][0] = a[PI[x]];
-                    a[PI[x]] = last.rotate_left(RHO[x]);
-                    last = arrays[i][0];
-                }
+        // Rho and pi
+        let mut last = a[1];
+        unroll! {
+            for x in 0..24 {
+                arrays[i][0] = a[PI[x]];
+                a[PI[x]] = last.rotate_left(RHO[x]);
+                last = arrays[i][0];
             }
+        }
 
-            // Chi
-            unroll! {
-                for y_step in 0..5 {
-                    let y = y_step * 5;
+        // Chi
+        unroll! {
+            for y_step in 0..5 {
+                let y = y_step * 5;
 
-                    unroll! {
-                        for x in 0..5 {
-                            arrays[i][x] = a[y + x];
-                        }
+                unroll! {
+                    for x in 0..5 {
+                        arrays[i][x] = a[y + x];
                     }
+                }
 
-                    unroll! {
-                        for x in 0..5 {
-                            a[y + x] = arrays[i][x] ^ ((!arrays[i][(x + 1) % 5]) & (arrays[i][(x + 2) % 5]));
-                        }
+                unroll! {
+                    for x in 0..5 {
+                        a[y + x] = arrays[i][x] ^ ((!arrays[i][(x + 1) % 5]) & (arrays[i][(x + 2) % 5]));
                     }
                 }
-            };
+            }
+        };
 
-            // Iota
-            a[0] ^= RC[i];
-        }
+        // Iota
+        a[0] ^= RC[i];
     }
 }
 

From 40a1fe5841dc9bcec17c47232b2ac3ce2ee135e1 Mon Sep 17 00:00:00 2001
From: David Palm <dvdplm@gmail.com>
Date: Fri, 18 May 2018 08:56:46 +0200
Subject: [PATCH 2/2] Add test for hashing with same buffer for input and
 output

---
 tests/test.rs | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/tests/test.rs b/tests/test.rs
index 054cafd..251a5bb 100644
--- a/tests/test.rs
+++ b/tests/test.rs
@@ -1,4 +1,5 @@
 extern crate tiny_keccak;
+extern crate core;
 
 use tiny_keccak::*;
 
@@ -20,20 +21,26 @@ fn empty_keccak() {
 }
 
 #[test]
-fn string_keccak_256() {
-    let mut keccak_256 = Keccak::new_keccak256();
-    let data: Vec<u8> = From::from("hello world");
-    keccak_256.update(&data);
-    let mut res : [u8;32] = [0;32];
-    keccak_256.finalize(&mut res);
-    let expected = vec![
-        0x47, 0x17, 0x32, 0x85, 0xa8, 0xd7, 0x34, 0x1e, 
-        0x5e, 0x97, 0x2f, 0xc6, 0x77, 0x28, 0x63, 0x84, 
-        0xf8, 0x02, 0xf8, 0xef, 0x42, 0xa5, 0xec, 0x5f, 
-        0x03, 0xbb, 0xfa, 0x25, 0x4c, 0xb0, 0x1f, 0xad
-    ];
-    let ref_ex: &[u8] = &expected;
-    assert_eq!(&res, ref_ex);
+fn string_keccak_256_overlapping_buffer() {
+    let mut in_and_out : [u8; 32] = [0;32];
+    for i in 1..6 { in_and_out[i as usize - 1] = i }
+
+    let ptr = in_and_out.as_mut_ptr();
+    Keccak::keccak256(
+        unsafe {
+            core::slice::from_raw_parts(ptr, 5) // read a piece from start of in_and_out
+        },
+        &mut in_and_out, // write over the whole array
+    );
+
+    let expected = vec![125, 135, 197, 234, 117, 247, 55, 139, 183, 1, 228, 4, 197, 6, 57, 22, 26, 243, 239, 246, 98, 147, 233, 243, 117, 181, 241, 126, 181, 4, 118, 244];
+    assert_eq!(&in_and_out, &expected.as_ref());
+
+    // Verify using overlapping in/out buffers yields same result as a "normal" hash
+    let control_in : [u8;5] = [1,2,3,4,5];
+    let mut control_out : [u8;32] = [0;32];
+    Keccak::keccak256(&control_in, &mut control_out);
+    assert_eq!(&control_out, &in_and_out);
 }
 
 #[test]