// k12 implements the KangarooTwelve XOF.
//
// KangarooTwelve is being standardised by the CFRG working group
// of the IRTF. This package implements draft 10.
//
// https://datatracker.ietf.org/doc/draft-irtf-cfrg-kangarootwelve/10/
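//
// A minimal usage sketch (the 32-byte output length is this example's
// choice; being an XOF, any length works):
//
//	var out [32]byte
//	Draft10Sum(out[:], []byte("message"), nil) // nil: empty context string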
package k12

import (
	"encoding/binary"

	"github.com/cloudflare/circl/internal/sha3"
	"github.com/cloudflare/circl/simd/keccakf1600"
)

const chunkSize = 8192 // aka B

// KangarooTwelve splits the message into chunks of 8192 bytes each.
// The first chunk is absorbed directly into a TurboSHAKE128 instance, which
// we call the stalk. The subsequent chunks aren't absorbed directly; instead
// their hashes are absorbed: they're like leaves on a stalk.
// If we have a fast parallel TurboSHAKE128 available, we buffer chunks until
// we have enough to run it. If not, we absorb each chunk directly into a
// separate TurboSHAKE128 state.
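//
// Schematically, the stalk ends up absorbing (cf. Write and Read below)
//
//	S[:8192] || 0x03 || 0x00^7 || CV_1 || ... || CV_n ||
//		length_encode(n) || 0xFF || 0xFF
//
// where S = message || C || length_encode(|C|) is split into 8192-byte
// chunks S_0, ..., S_n and CV_i = TurboSHAKE128(S_i, 0x0B) truncated to
// 32 bytes. (If S fits in a single chunk, the stalk is simply
// TurboSHAKE128(S, 0x07).)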

type State struct {
	initialTodo int // Bytes left to absorb for the first chunk.

	stalk sha3.State

	context []byte // context string "C" provided by the user

	// buffer of incoming data so we can do parallel TurboSHAKE128:
	// nil when we haven't absorbed the first chunk yet;
	// empty if we have, but we do not have a fast parallel TurboSHAKE128;
	// and chunkSize*lanes in length if we have.
	buf []byte

	offset int // offset in buf or bytes written to leaf

	// Number of chunk hashes ("CV_i") absorbed into the stalk.
	chunk uint

	// TurboSHAKE128 instance to compute a leaf in case we don't have
	// a fast parallel TurboSHAKE128, viz. when lanes == 1.
	leaf *sha3.State

	lanes uint8 // number of TurboSHAKE128s to compute in parallel
}
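
// A State is written to and read from incrementally, e.g. (sketch):
//
//	s := NewDraft10(nil)
//	_, _ = s.Write(firstPart)
//	_, _ = s.Write(secondPart)
//	_, _ = s.Read(out[:]) // squeeze as many bytes as needed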

// NewDraft10 creates a new instance of KangarooTwelve draft version 10,
// with the given context string c.
func NewDraft10(c []byte) State {
	var lanes byte = 1

	if keccakf1600.IsEnabledX4() {
		lanes = 4
	} else if keccakf1600.IsEnabledX2() {
		lanes = 2
	}

	return newDraft10(c, lanes)
}

func newDraft10(c []byte, lanes byte) State {
	return State{
		initialTodo: chunkSize,
		stalk:       sha3.NewTurboShake128(0x07),
		context:     c,
		lanes:       lanes,
	}
}

// Reset restores the State to its initial state, so it can be reused.
func (s *State) Reset() {
	s.initialTodo = chunkSize
	s.stalk.Reset()
	s.stalk.SwitchDS(0x07)
	s.buf = nil
	s.offset = 0
	s.chunk = 0
}

// Draft10Sum computes the KangarooTwelve draft 10 hash of msg with
// context string c, writing len(hash) bytes of output to hash.
func Draft10Sum(hash []byte, msg []byte, c []byte) {
	// TODO Tweak number of lanes depending on the length of the message.
	s := NewDraft10(c)
	_, _ = s.Write(msg)
	_, _ = s.Read(hash)
}
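
// Because KangarooTwelve is an XOF, hash may be of any length; a shorter
// output is simply a prefix of a longer one.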

// Write absorbs more data into the state of the hash.
func (s *State) Write(p []byte) (int, error) {
	written := len(p)

	// The first chunk is written directly to the stalk.
	if s.initialTodo > 0 {
		taken := s.initialTodo
		if len(p) < taken {
			taken = len(p)
		}
		headP := p[:taken]
		_, _ = s.stalk.Write(headP)
		s.initialTodo -= taken
		p = p[taken:]
	}

	if len(p) == 0 {
		return written, nil
	}

	// If this is the first bit of data written after the initial chunk,
	// we're out of the fast-path and allocate some buffers.
	if s.buf == nil {
		if s.lanes != 1 {
			s.buf = make([]byte, int(s.lanes)*chunkSize)
		} else {
			// We create the buffer to signal we're past the first chunk,
			// but do not use it.
			s.buf = make([]byte, 0)
			h := sha3.NewTurboShake128(0x0B)
			s.leaf = &h
		}

		// Absorb the marker 0x03 || 0x00^7 that separates the first chunk
		// from the chaining values, and switch the stalk's domain separator
		// to 0x06: the stalk is now the final node of the tree.
		_, _ = s.stalk.Write([]byte{0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00})
		s.stalk.SwitchDS(0x06)
	}

	// If we're just using one lane, we don't need to cache in a buffer
	// for parallel hashing. Instead, we feed directly to TurboSHAKE.
	if s.lanes == 1 {
		for len(p) > 0 {
			// Write to the current leaf.
			to := chunkSize - s.offset
			if len(p) < to {
				to = len(p)
			}
			_, _ = s.leaf.Write(p[:to])
			p = p[to:]
			s.offset += to

			// Did we fill the chunk? If so, absorb its 32-byte chaining
			// value into the stalk and start a fresh leaf.
			if s.offset == chunkSize {
				var cv [32]byte
				_, _ = s.leaf.Read(cv[:])
				_, _ = s.stalk.Write(cv[:])
				s.leaf.Reset()
				s.offset = 0
				s.chunk++
			}
		}

		return written, nil
	}

	// If we can't fill all our lanes or the buffer isn't empty, we write
	// the data to the buffer.
	if s.offset != 0 || len(p) < len(s.buf) {
		to := len(s.buf) - s.offset
		if len(p) < to {
			to = len(p)
		}
		p2 := p[:to]
		p = p[to:]
		copy(s.buf[s.offset:], p2)
		s.offset += to
	}

	// Absorb the buffer if we filled it.
	if s.offset == len(s.buf) {
		s.writeX(s.buf)
		s.offset = 0
	}

	// Note that at this point we may assume that s.offset == 0 if len(p) != 0.
	if len(p) != 0 && s.offset != 0 {
		panic("shouldn't happen")
	}

	// Absorb a bunch of chunks at the same time.
	if len(p) >= int(s.lanes)*chunkSize {
		p = s.writeX(p)
	}

	// Put the remainder in the buffer.
	if len(p) > 0 {
		copy(s.buf, p)
		s.offset = len(p)
	}

	return written, nil
}

// writeX absorbs a multiple of lanes * chunkSize bytes and returns the
// remainder.
func (s *State) writeX(p []byte) []byte {
	switch s.lanes {
	case 4:
		return s.writeX4(p)
	default:
		return s.writeX2(p)
	}
}
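
// Note that writeX is only reached when lanes is 2 or 4; the lanes == 1
// case is handled inline in Write, so the default branch above is the
// two-lane path.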

func (s *State) writeX4(p []byte) []byte {
	for len(p) >= 4*chunkSize {
		var x4 keccakf1600.StateX4
		a := x4.Initialize(true)

		// a holds the four Keccak states interleaved: a[4*i+j] is the
		// i-th 64-bit word of the state hashing chunk j.
		//
		// Each 8192-byte chunk is absorbed as 48 full rate-blocks of
		// TurboSHAKE128 (168 bytes = 21 words each) followed by a final
		// partial block of 128 bytes: 8192 = 48*168 + 128.
		for offset := 0; offset < 48*168; offset += 168 {
			for i := 0; i < 21; i++ {
				a[i*4] ^= binary.LittleEndian.Uint64(
					p[8*i+offset:],
				)
				a[i*4+1] ^= binary.LittleEndian.Uint64(
					p[chunkSize+8*i+offset:],
				)
				a[i*4+2] ^= binary.LittleEndian.Uint64(
					p[chunkSize*2+8*i+offset:],
				)
				a[i*4+3] ^= binary.LittleEndian.Uint64(
					p[chunkSize*3+8*i+offset:],
				)
			}

			x4.Permute()
		}

		// Absorb the final 128 bytes (16 words) of each chunk.
		for i := 0; i < 16; i++ {
			a[i*4] ^= binary.LittleEndian.Uint64(
				p[8*i+48*168:],
			)
			a[i*4+1] ^= binary.LittleEndian.Uint64(
				p[chunkSize+8*i+48*168:],
			)
			a[i*4+2] ^= binary.LittleEndian.Uint64(
				p[chunkSize*2+8*i+48*168:],
			)
			a[i*4+3] ^= binary.LittleEndian.Uint64(
				p[chunkSize*3+8*i+48*168:],
			)
		}

		// Pad each lane: the leaf domain separator 0x0B right after the
		// data and the final 0x80 bit at the end of the 168-byte rate-block.
		a[16*4] ^= 0x0b
		a[16*4+1] ^= 0x0b
		a[16*4+2] ^= 0x0b
		a[16*4+3] ^= 0x0b
		a[20*4] ^= 0x80 << 56
		a[20*4+1] ^= 0x80 << 56
		a[20*4+2] ^= 0x80 << 56
		a[20*4+3] ^= 0x80 << 56

		x4.Permute()

		// Extract the 32-byte chaining value of each chunk and absorb
		// all four into the stalk.
		var buf [32 * 4]byte
		for i := 0; i < 4; i++ {
			binary.LittleEndian.PutUint64(buf[8*i:], a[4*i])
			binary.LittleEndian.PutUint64(buf[32+8*i:], a[4*i+1])
			binary.LittleEndian.PutUint64(buf[32*2+8*i:], a[4*i+2])
			binary.LittleEndian.PutUint64(buf[32*3+8*i:], a[4*i+3])
		}

		_, _ = s.stalk.Write(buf[:])
		p = p[chunkSize*4:]
		s.chunk += 4
	}

	return p
}

func (s *State) writeX2(p []byte) []byte {
	// TODO On M2 Pro, 1/3 of the time is spent in this function
	// and LittleEndian.Uint64, excluding the actual permutation.
	// Rewriting it in assembler might be worthwhile.
	for len(p) >= 2*chunkSize {
		var x2 keccakf1600.StateX2
		a := x2.Initialize(true)

		// As in writeX4, but with two interleaved states: a[2*i+j] is
		// the i-th word of the state hashing chunk j.
		for offset := 0; offset < 48*168; offset += 168 {
			for i := 0; i < 21; i++ {
				a[i*2] ^= binary.LittleEndian.Uint64(
					p[8*i+offset:],
				)
				a[i*2+1] ^= binary.LittleEndian.Uint64(
					p[chunkSize+8*i+offset:],
				)
			}

			x2.Permute()
		}

		for i := 0; i < 16; i++ {
			a[i*2] ^= binary.LittleEndian.Uint64(
				p[8*i+48*168:],
			)
			a[i*2+1] ^= binary.LittleEndian.Uint64(
				p[chunkSize+8*i+48*168:],
			)
		}

		a[16*2] ^= 0x0b
		a[16*2+1] ^= 0x0b
		a[20*2] ^= 0x80 << 56
		a[20*2+1] ^= 0x80 << 56

		x2.Permute()

		var buf [32 * 2]byte
		for i := 0; i < 4; i++ {
			binary.LittleEndian.PutUint64(buf[8*i:], a[2*i])
			binary.LittleEndian.PutUint64(buf[32+8*i:], a[2*i+1])
		}

		_, _ = s.stalk.Write(buf[:])
		p = p[chunkSize*2:]
		s.chunk += 2
	}

	return p
}

// Read squeezes output from the XOF. On the first call it finalizes the
// absorbing phase: it appends the context string, the chaining values of
// any remaining buffered chunks, and the final padding to the stalk.
func (s *State) Read(p []byte) (int, error) {
	if s.stalk.IsAbsorbing() {
		// Write context string C.
		_, _ = s.Write(s.context)

		// Write length_encode( |C| ): the big-endian encoding of |C|
		// without leading zero bytes, followed by the length of that
		// encoding in bytes. E.g. length_encode(0) = 0x00 and
		// length_encode(258) = 0x01 0x02 0x02.
		var buf [9]byte
		binary.BigEndian.PutUint64(buf[:8], uint64(len(s.context)))

		// Find the first non-zero digit in the big-endian encoding of
		// the context length.
		i := 0
		for buf[i] == 0 && i < 8 {
			i++
		}

		buf[8] = byte(8 - i) // number of bytes to represent |C|
		_, _ = s.Write(buf[i:])

		// We need to write the chunk number if we're past the first chunk.
		if s.buf != nil {
			// Write the last remaining chunk(s).
			var cv [32]byte
			if s.lanes == 1 {
				if s.offset != 0 {
					_, _ = s.leaf.Read(cv[:])
					_, _ = s.stalk.Write(cv[:])
					s.chunk++
				}
			} else {
				remainingBuf := s.buf[:s.offset]
				for len(remainingBuf) > 0 {
					h := sha3.NewTurboShake128(0x0B)
					to := chunkSize
					if len(remainingBuf) < to {
						to = len(remainingBuf)
					}
					_, _ = h.Write(remainingBuf[:to])
					_, _ = h.Read(cv[:])
					_, _ = s.stalk.Write(cv[:])
					s.chunk++
					remainingBuf = remainingBuf[to:]
				}
			}

			// Write length_encode( chunk ).
			binary.BigEndian.PutUint64(buf[:8], uint64(s.chunk))

			// Find the first non-zero digit in the big-endian encoding of
			// the number of chunks.
			i = 0
			for buf[i] == 0 && i < 8 {
				i++
			}

			buf[8] = byte(8 - i) // number of bytes to represent the number of chunks
			_, _ = s.stalk.Write(buf[i:])
			_, _ = s.stalk.Write([]byte{0xff, 0xff})
		}
	}

	return s.stalk.Read(p)
}
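
// Successive Reads continue the same output stream, so squeezing in
// pieces matches one big squeeze (sketch):
//
//	var a, b [16]byte
//	s := NewDraft10(nil)
//	_, _ = s.Write(msg)
//	_, _ = s.Read(a[:])
//	_, _ = s.Read(b[:]) // a || b equals a single 32-byte Read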