Skip to content

Commit

Permalink
Fix stack frame size errors on ARM32
Browse files Browse the repository at this point in the history
- Don't unroll loops in Skein on 32-bit to save stack space
- Add memory barriers in sha2.c on 32-bit to save stack space
- Add comments to "uninitilized" variables in edonr_byteorder.h asm
  • Loading branch information
tonyhutter committed Jul 11, 2016
1 parent 145c821 commit b9f861f
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 53 deletions.
26 changes: 6 additions & 20 deletions module/icp/algs/edonr/edonr.c
Original file line number Diff line number Diff line change
Expand Up @@ -337,34 +337,20 @@ Q256(size_t bitlen, const uint32_t *data, uint32_t *restrict p)
return (bitlen - bl);
}


#pragma GCC diagnostic ignored "-Wframe-larger-than="
/*
* Why is this #pragma here?
*
* Checksum functions like this one can go over the stack frame size check
* Linux imposes on 32-bit platforms (-Wframe-larger-than=1024). We can
* safely ignore the compiler error since we know that in ZoL, that
* the function will be called from a worker thread that won't be using
* much stack.
*
* The reason for the high stack frame usage comes from the large number of
* calculation permutations that are going on. When you have so many local
* variables all doing calculations on each other, it forces the compiler to
* create lots of additional temporary local variables as part of its Single
* Static Assignment (SSA) step. The only way to get around this would be
* to break up the calculation into parts (in separate functions). We don't
* want to do this because:
*
* 1. We want to keep these routines as much in common with the Illumos code
* as possible. Checksum code isn't something you should be changing
* wily-nily.
*
* 2. It's hard to break up these functions since since they need to maintain
* a lot of local state to compute the checksums. You would have to pass
* most or all of these local variables back and forth with another
* function, which can get really ugly...
* much stack. The only function that goes over the 1k limit is Q512(),
* which only goes over it by a hair (1248 bytes on ARM32).
*/
#include <sys/isa_defs.h> /* for _ILP32 */
#ifdef _ILP32 /* We're 32-bit, assume small stack frames */
#pragma GCC diagnostic ignored "-Wframe-larger-than="
#endif

#if defined(__IBMC__) && defined(_AIX) && defined(__64BIT__)
static inline size_t
Expand Down
8 changes: 4 additions & 4 deletions module/icp/algs/edonr/edonr_byteorder.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,15 @@
#else
#define aix_ld_swap64(s64, d64) \
{ \
uint64_t *s4 = 0, h; \
uint64_t *s4 = 0, h; /* initialize to zero for gcc warning */ \
\
__asm__("addi %0,%3,4;lwbrx %1,0,%3;lwbrx %2,0,%0;rldimi %1,%2,32,0"\
: "+r"(s4), "=r"(d64), "=r"(h) : "b"(s64)); \
}

#define aix_st_swap64(s64, d64) \
{ \
uint64_t *s4 = 0, h; \
uint64_t *s4 = 0, h; /* initialize to zero for gcc warning */ \
h = (s64) >> 32; \
__asm__ volatile("addi %0,%3,4;stwbrx %1,0,%3;stwbrx %2,0,%0" \
: "+r"(s4) : "r"(s64), "r"(h), "b"(d64)); \
Expand All @@ -103,15 +103,15 @@
#else
#define aix_ld_swap64(s64, d64) \
{ \
uint32_t *s4 = 0, h, l; \
uint32_t *s4 = 0, h, l; /* initialize to zero for gcc warning */\
__asm__("addi %0,%3,4;lwbrx %1,0,%3;lwbrx %2,0,%0" \
: "+r"(s4), "=r"(l), "=r"(h) : "b"(s64)); \
d64 = ((uint64_t)h<<32) | l; \
}

#define aix_st_swap64(s64, d64) \
{ \
uint32_t *s4 = 0, h, l; \
uint32_t *s4 = 0, h, l; /* initialize to zero for gcc warning */\
l = (s64) & 0xfffffffful, h = (s64) >> 32; \
__asm__ volatile("addi %0,%3,4;stwbrx %1,0,%3;stwbrx %2,0,%0" \
: "+r"(s4) : "r"(l), "r"(h), "b"(d64)); \
Expand Down
16 changes: 15 additions & 1 deletion module/icp/algs/sha2/sha2.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#include <sys/byteorder.h>
#define HAVE_HTONL
#endif
#include <sys/isa_defs.h> /* for _ILP32 */

static void Encode(uint8_t *, uint32_t *, size_t);
static void Encode64(uint8_t *, uint64_t *, size_t);
Expand All @@ -65,6 +66,18 @@ static void SHA512Transform(SHA2_CTX *, const uint8_t *);

static uint8_t PADDING[128] = { 0x80, /* all zeros */ };

/*
* The low-level checksum routines use a lot of stack space. On systems where
* small stacks are enforced (like 32-bit kernel builds), insert compiler memory
* barriers to reduce stack frame size. This can reduce the SHA512Transform()
* stack frame usage from 3k to <1k on ARM32, for example.
*/
#if defined(_ILP32) || defined(__powerpc) /* small stack */
#define SMALL_STACK_MEMORY_BARRIER asm volatile("": : :"memory");
#else
#define SMALL_STACK_MEMORY_BARRIER
#endif

/* Ch and Maj are the basic SHA2 functions. */
#define Ch(b, c, d) (((b) & (c)) ^ ((~b) & (d)))
#define Maj(b, c, d) (((b) & (c)) ^ ((b) & (d)) ^ ((c) & (d)))
Expand Down Expand Up @@ -97,7 +110,8 @@ static uint8_t PADDING[128] = { 0x80, /* all zeros */ };
T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + SHA512_CONST(i) + w; \
d += T1; \
T2 = BIGSIGMA0(a) + Maj(a, b, c); \
h = T1 + T2
h = T1 + T2; \
SMALL_STACK_MEMORY_BARRIER;

/*
* sparc optimization:
Expand Down
53 changes: 25 additions & 28 deletions module/icp/algs/skein/skein_block.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,39 @@

#include <sys/skein.h>
#include "skein_impl.h"
#include <sys/isa_defs.h> /* for _ILP32 */

#ifndef SKEIN_USE_ASM
#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */
#endif

#ifndef SKEIN_LOOP
/*
* The low-level checksum routines use a lot of stack space. On systems where
* small stacks frame are enforced (like 32-bit kernel builds), do not unroll
* checksum calculations to save stack space.
*
* Even with no loops unrolled, we still can exceed the 1k stack frame limit
* in Skein1024_Process_Block() (it hits 1272 bytes on ARM32). We can
* safely ignore it though, since that the checksum functions will be called
* from a worker thread that won't be using much stack. That's why we have
* the #pragma here to ignore the warning.
*/
#if defined(_ILP32) || defined(__powerpc) /* Assume small stack */
#pragma GCC diagnostic ignored "-Wframe-larger-than="
/*
* We're running on 32-bit, don't unroll loops to save stack frame space
*
* Due to the ways the calculations on SKEIN_LOOP are done in
* Skein_*_Process_Block(), a value of 111 disables unrolling loops
* in any of those functions.
*/
#define SKEIN_LOOP 111
#else
/* We're compiling with large stacks */
#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */
#endif
#endif

/* some useful definitions for code here */
#define BLK_BITS (WCNT*64)
Expand All @@ -33,34 +58,6 @@
/* Skein_256 */
#if !(SKEIN_USE_ASM & 256)


#pragma GCC diagnostic ignored "-Wframe-larger-than="
/*
* Why is this #pragma here?
*
* Checksum functions like this one can go over the stack frame size check
* Linux imposes on 32-bit platforms (-Wframe-larger-than=1024). We can
* safely ignore the compiler error since we know that in ZoL, that
* the function will be called from a worker thread that won't be using
* much stack.
*
* The reason for the high stack frame usage comes from the large number of
* calculation permutations that are going on. When you have so many local
* variables all doing calculations on each other, it forces the compiler to
* create lots of additional temporary local variables as part of its Single
* Static Assignment (SSA) step. The only way to get around this would be
* to break up the calculation into parts (in separate functions). We don't
* want to do this because:
*
* 1. We want to keep these routines as much in common with the Illumos code
* as possible. Checksum code isn't something you should be changing
* wily-nily.
*
* 2. It's hard to break up these functions since since they need to maintain
* a lot of local state to compute the checksums. You would have to pass
* most or all of these local variables back and forth with another
* function, which can get really ugly...
*/
void
Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
size_t blkCnt, size_t byteCntAdd)
Expand Down

0 comments on commit b9f861f

Please sign in to comment.