Skip to content

Commit

Permalink
Add specialized reduction for P256 in pcurves
Browse files Browse the repository at this point in the history
For 32-bit x86, this reduction results in point arithmetic operations
that are 25-35% faster than when using Montgomery.

Sadly, for 64-bit x86 it is at best about even with Montgomery,
and with Clang on 64-bit it is even somewhat slower.
  • Loading branch information
randombit committed Jun 24, 2024
1 parent d24c2c3 commit 1b1b0eb
Showing 1 changed file with 160 additions and 4 deletions.
164 changes: 160 additions & 4 deletions src/lib/math/pcurves/pcurves_secp256r1/pcurves_secp256r1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,159 @@ namespace Botan::PCurve {

namespace {

// clang-format off
template <WordType W>
constexpr uint32_t get_uint32(const W xw[], size_t i) {
   static_assert(WordInfo<W>::bits == 32 || WordInfo<W>::bits == 64);

   if constexpr(WordInfo<W>::bits == 64) {
      // Two 32-bit halves per 64-bit word; odd i selects the high half
      const size_t shift = 32 * (i % 2);
      return static_cast<uint32_t>(xw[i / 2] >> shift);
   } else {
      // One 32-bit limb per word; direct indexing
      return xw[i];
   }
}

template <WordType W, size_t N>
class SumAccum {
   public:
      static_assert(WordInfo<W>::bits == 32 || WordInfo<W>::bits == 64);

      // Total number of 32-bit limbs that the output array can hold
      static constexpr size_t N32 = N * (WordInfo<W>::bits / 32);

      SumAccum(std::array<W, N>& r) : m_r(r), m_sum(0), m_pos(0) {}

      // Fold the signed column value v into the running accumulator and
      // emit the low 32 bits as the next output limb; the arithmetic
      // right shift keeps the signed carry for the following column.
      void accum(int64_t v) {
         BOTAN_STATE_CHECK(m_pos < N32);

         m_sum += v;
         const auto limb32 = static_cast<uint32_t>(m_sum);
         m_sum >>= 32;

         if constexpr(WordInfo<W>::bits == 64) {
            m_r[m_pos / 2] |= static_cast<uint64_t>(limb32) << (32 * (m_pos % 2));
         } else {
            m_r[m_pos] = limb32;
         }

         m_pos += 1;
      }

      // Add the final column C after all limbs were emitted and return
      // the remaining (non-negative) carry out of the accumulation.
      W final_carry(int64_t C) {
         BOTAN_STATE_CHECK(m_pos == N32);
         m_sum += C;
         BOTAN_DEBUG_ASSERT(m_sum >= 0);
         return static_cast<W>(m_sum);
      }

   private:
      std::array<W, N>& m_r;
      int64_t m_sum;
      size_t m_pos;
};

// Specialized modular arithmetic representation for the P-256 prime,
// using direct (non-Montgomery) form: to_rep/from_rep are identities and
// wide products are reduced with the NIST/Solinas fast reduction.
template <typename Params>
class Secp256r1Rep final {
   public:
      static constexpr auto P = Params::P;
      static constexpr size_t N = Params::N;
      typedef typename Params::W W;

      // Adds 4 * P-256 to prevent underflow
      //
      // 4*P is a 260-bit value, so this constant spans 9 32-bit words; its
      // low limbs are folded into the signed column sums in redc() so that
      // each column stays non-negative despite the subtracted terms.
      static constexpr auto P256_4 =
         hex_to_words<uint32_t>("0x3fffffffc00000004000000000000000000000003fffffffffffffffffffffffc");

      // Reduce a 512-bit product z modulo the P-256 prime.
      //
      // Since p = 2^256 - 2^224 + 2^192 + 2^96 - 1, the high eight 32-bit
      // limbs (X08..X15) of z fold into the low 256 bits as small signed
      // combinations — the classic FIPS 186-4 / Solinas reduction
      // s1 + 2*s2 + 2*s3 + s4 + s5 - s6 - s7 - s8 - s9, computed here
      // column by column. Returns a value congruent to z modulo P.
      constexpr static std::array<W, N> redc(const std::array<W, 2 * N>& z) {
         // View the input as 16 32-bit limbs, widened to int64_t so the
         // signed column sums below cannot overflow.
         const int64_t X00 = get_uint32(z.data(), 0);
         const int64_t X01 = get_uint32(z.data(), 1);
         const int64_t X02 = get_uint32(z.data(), 2);
         const int64_t X03 = get_uint32(z.data(), 3);
         const int64_t X04 = get_uint32(z.data(), 4);
         const int64_t X05 = get_uint32(z.data(), 5);
         const int64_t X06 = get_uint32(z.data(), 6);
         const int64_t X07 = get_uint32(z.data(), 7);
         const int64_t X08 = get_uint32(z.data(), 8);
         const int64_t X09 = get_uint32(z.data(), 9);
         const int64_t X10 = get_uint32(z.data(), 10);
         const int64_t X11 = get_uint32(z.data(), 11);
         const int64_t X12 = get_uint32(z.data(), 12);
         const int64_t X13 = get_uint32(z.data(), 13);
         const int64_t X14 = get_uint32(z.data(), 14);
         const int64_t X15 = get_uint32(z.data(), 15);

         // Per-column signed sums of the reduction terms, each offset by
         // the matching 32-bit limb of 4*P so the overall total cannot
         // underflow. S8 carries the 9th (top) limb of 4*P.
         const int64_t S0 = P256_4[0] + X00 + X08 + X09 - (X11 + X12 + X13 + X14);
         const int64_t S1 = P256_4[1] + X01 + X09 + X10 - (X12 + X13 + X14 + X15);
         const int64_t S2 = P256_4[2] + X02 + X10 + X11 - (X13 + X14 + X15);
         const int64_t S3 = P256_4[3] + X03 + 2 * (X11 + X12) + X13 - (X15 + X08 + X09);
         const int64_t S4 = P256_4[4] + X04 + 2 * (X12 + X13) + X14 - (X09 + X10);
         const int64_t S5 = P256_4[5] + X05 + 2 * (X13 + X14) + X15 - (X10 + X11);
         const int64_t S6 = P256_4[6] + X06 + X13 + X14 * 3 + X15 * 2 - (X08 + X09);
         const int64_t S7 = P256_4[7] + X07 + X15 * 3 + X08 - (X10 + X11 + X12 + X13);
         const int64_t S8 = P256_4[8];

         std::array<W, N> r = {};

         // Carry-propagate the signed columns into 32-bit limbs of r
         SumAccum sum(r);

         sum.accum(S0);
         sum.accum(S1);
         sum.accum(S2);
         sum.accum(S3);
         sum.accum(S4);
         sum.accum(S5);
         sum.accum(S6);
         sum.accum(S7);
         const auto S = sum.final_carry(S8);

         // The final carry S is small and data-independent in its bound, so
         // it is safe to unpoison for the assertion below.
         // NOTE(review): the assert message reads oddly — the check asserts
         // the carry is *bounded*, i.e. no unexpected overflow occurred.
         CT::unpoison(S);
         BOTAN_ASSERT(S <= 8, "Expected overflow");

         // Fold the carry back in: subtract a multiple of P derived from S
         // (see p256_mul_mod_256), then conditionally add P back if the
         // subtraction borrowed. Runs the same instructions either way.
         const auto correction = p256_mul_mod_256(S);
         W borrow = bigint_sub2(r.data(), N, correction.data(), N);

         bigint_cnd_add(borrow, r.data(), N, P.data(), N);

         return r;
      }

      // Identity representation: the multiplicative unit is literally 1
      constexpr static std::array<W, N> one() { return std::array<W, N>{1}; }

      // No conversion needed on entry (unlike Montgomery form)
      constexpr static std::array<W, N> to_rep(const std::array<W, N>& x) { return x; }

      // Wide (double-width) values are brought into range via redc
      constexpr static std::array<W, N> wide_to_rep(const std::array<W, 2 * N>& x) { return redc(x); }

      // No conversion needed on exit
      constexpr static std::array<W, N> from_rep(const std::array<W, N>& z) { return z; }

   private:
      // Return (i*P-256) % 2**256
      //
      // Assumes i is small
      //
      // NOTE(review): starting from r = P and applying the deltas below adds
      // i*(P - 2^256), i.e. the value produced appears to be (i+1)*P mod 2^256
      // rather than i*P; redc's subtract-then-conditionally-add-P sequence is
      // consistent with exactly that value — confirm against upstream.
      constexpr static std::array<W, N> p256_mul_mod_256(W i) {
         static_assert(WordInfo<W>::bits == 32 || WordInfo<W>::bits == 64);

         // For small i, multiples of P-256 have a simple structure so it's faster to
         // compute the value directly vs a (constant time) table lookup

         if constexpr(WordInfo<W>::bits == 32) {
            // Adjust the 32-bit limbs at bit offsets 224, 192, 96 and 0,
            // matching P's sparse structure
            auto r = P;
            r[7] -= i;
            r[6] += i;
            r[3] += i;
            r[0] -= i;
            return r;
         } else {
            // Same adjustments expressed on 64-bit limbs: i32 places i at a
            // 32-bit offset within the containing word
            auto r = P;
            const uint64_t i32 = static_cast<uint64_t>(i) << 32;
            r[3] -= i32;
            r[3] += i;
            r[1] += i32;
            r[0] -= i;
            return r;
         }
      }
};

namespace secp256r1 {

// clang-format off
class Params final : public EllipticCurveParameters<
"FFFFFFFF00000001000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFF",
"FFFFFFFF00000001000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFC",
Expand All @@ -25,11 +175,17 @@ class Params final : public EllipticCurveParameters<
-10> {
};

class Curve final : public EllipticCurve<Params> {};
// clang-format on

}
#if (BOTAN_MP_WORD_BITS == 32)
// Secp256r1Rep works for 64 bit also, but is at best marginally faster at least
// on compilers/CPUs tested so far
class Curve final : public EllipticCurve<Params, Secp256r1Rep> {};
#else
class Curve final : public EllipticCurve<Params> {};
#endif

// clang-format on
} // namespace secp256r1

} // namespace

Expand Down

0 comments on commit 1b1b0eb

Please sign in to comment.