Optimize Platform + Restructure #3

Draft · wants to merge 5 commits into base: main
Changes from 1 commit
double buffering + prefetch
AndreSlavescu committed Nov 14, 2023
commit c7711192d2ac8d86b9ed20ce95e00ed0cfb86fe7
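
For context on the technique this commit names, here is a minimal, self-contained sketch of the classic double-buffering + prefetch pattern for a tiled matrix multiply. It is written with plain float arithmetic so it stands on its own; the PR's kernel instead operates on Fr_t with the blstrs scalar primitives, and the kernel and variable names below are illustrative only, not a drop-in replacement for the field-arithmetic kernel in this diff.

#define TILE_WIDTH 16

// Sketch: tile t+1 is prefetched into the spare shared-memory buffer while tile t
// is consumed, so only one __syncthreads() per loop iteration is needed.
__global__ void matmulDoubleBuffered(const float *A, const float *B, float *C,
                                     int rowsA, int colsA, int colsB)
{
    __shared__ float A_tiles[2][TILE_WIDTH][TILE_WIDTH];
    __shared__ float B_tiles[2][TILE_WIDTH][TILE_WIDTH];

    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
    int numTiles = (colsA + TILE_WIDTH - 1) / TILE_WIDTH;
    float sum = 0.0f;

    // Prefetch tile 0 into buffer 0 (zero-pad out-of-range elements).
    A_tiles[0][threadIdx.y][threadIdx.x] =
        (row < rowsA && threadIdx.x < colsA) ? __ldg(&A[row * colsA + threadIdx.x]) : 0.0f;
    B_tiles[0][threadIdx.y][threadIdx.x] =
        (threadIdx.y < colsA && col < colsB) ? __ldg(&B[threadIdx.y * colsB + col]) : 0.0f;
    __syncthreads();

    for (int t = 0; t < numTiles; ++t)
    {
        int cur = t & 1;
        int nxt = 1 - cur;

        // Issue the global-memory loads for tile t+1 before computing on tile t.
        if (t + 1 < numTiles)
        {
            int aCol = (t + 1) * TILE_WIDTH + threadIdx.x;
            int bRow = (t + 1) * TILE_WIDTH + threadIdx.y;
            A_tiles[nxt][threadIdx.y][threadIdx.x] =
                (row < rowsA && aCol < colsA) ? __ldg(&A[row * colsA + aCol]) : 0.0f;
            B_tiles[nxt][threadIdx.y][threadIdx.x] =
                (bRow < colsA && col < colsB) ? __ldg(&B[bRow * colsB + col]) : 0.0f;
        }

        // Consume the tile that is already resident in shared memory.
#pragma unroll
        for (int k = 0; k < TILE_WIDTH; ++k)
            sum += A_tiles[cur][threadIdx.y][k] * B_tiles[cur][k][threadIdx.x];

        // One barrier per iteration: it publishes the prefetched tile and protects
        // the buffer that will be overwritten in the next iteration.
        __syncthreads();
    }

    if (row < rowsA && col < colsB)
        C[row * colsB + col] = sum;
}
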
111 changes: 69 additions & 42 deletions zkfc.cu
@@ -2,58 +2,75 @@

#define TILE_WIDTH 16


KERNEL void matrixMultiplyOptimized(Fr_t* A, Fr_t* B, Fr_t* C, int rowsA, int colsA, int colsB) {
__shared__ Fr_t A_tile[TILE_WIDTH][TILE_WIDTH];
__shared__ Fr_t B_tile[TILE_WIDTH][TILE_WIDTH];
KERNEL void matrixMultiplyOptimized(Fr_t *A, Fr_t *B, Fr_t *C, int rowsA, int colsA, int colsB)
{
// Leverage double buffering
__shared__ Fr_t A_tiles[2][TILE_WIDTH][TILE_WIDTH];
__shared__ Fr_t B_tiles[2][TILE_WIDTH][TILE_WIDTH];

int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
int col = blockIdx.x * TILE_WIDTH + threadIdx.x;

Fr_t sum = blstrs__scalar__Scalar_ZERO;

// Loop over the tiles of A and B required to compute the block sub-matrix
for (int t = 0; t < (colsA - 1)/TILE_WIDTH + 1; ++t) {
#pragma unroll
for (int t = 0; t < (colsA - 1) / TILE_WIDTH + 1; ++t)
{
// buffer index
int buffer = t % 2;

// Load the matrices from device memory to shared memory; each thread loads
// one element of each matrix
if (row < rowsA && t*TILE_WIDTH + threadIdx.x < colsA) {
A_tile[threadIdx.y][threadIdx.x] = A[row*colsA + t*TILE_WIDTH + threadIdx.x];
} else {
A_tile[threadIdx.y][threadIdx.x] = blstrs__scalar__Scalar_ZERO;
if (row < rowsA && t * TILE_WIDTH + threadIdx.x < colsA)
{
// prefetch matrix A into shared mem
A_tiles[buffer][threadIdx.y][threadIdx.x] = __ldg(&A[row * colsA + t * TILE_WIDTH + threadIdx.x]);
}

if (t*TILE_WIDTH + threadIdx.y < colsA && col < colsB) {
B_tile[threadIdx.y][threadIdx.x] = B[(t*TILE_WIDTH + threadIdx.y)*colsB + col];
} else {
B_tile[threadIdx.y][threadIdx.x] = blstrs__scalar__Scalar_ZERO;
else
{
A_tiles[buffer][threadIdx.y][threadIdx.x] = blstrs__scalar__Scalar_ZERO;
}

if (t * TILE_WIDTH + threadIdx.y < colsA && col < colsB)
{
// prefetch matrix B into shared mem
B_tiles[buffer][threadIdx.y][threadIdx.x] = __ldg(&B[(t * TILE_WIDTH + threadIdx.y) * colsB + col]);
}
else
{
B_tiles[buffer][threadIdx.y][threadIdx.x] = blstrs__scalar__Scalar_ZERO;
}

// Synchronize to ensure all the data in shared memory is available
__syncthreads();

// Multiply the two matrices together;
for (int k = 0; k < TILE_WIDTH; ++k) {
sum = blstrs__scalar__Scalar_add(sum, blstrs__scalar__Scalar_mul(A_tile[threadIdx.y][k], B_tile[k][threadIdx.x]));
// multiply matrices
#pragma unroll
for (int k = 0; k < TILE_WIDTH; ++k)
{
Fr_t A_value = A_tiles[buffer][threadIdx.y][k];
Fr_t B_value = B_tiles[buffer][k][threadIdx.x];
sum = blstrs__scalar__Scalar_add(sum, blstrs__scalar__Scalar_mul(A_value, B_value));
}

// Synchronize to ensure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
__syncthreads();
}

if (row < rowsA && col < colsB) {
C[row*colsB + col] = sum;
if (row < rowsA && col < colsB)
{
C[row * colsB + col] = sum;
}
}
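
// Example launch (illustrative only; mirrors the grid/block sizing visible in
// zkFC::operator() below, with one TILE_WIDTH x TILE_WIDTH block per output tile):
// dim3 blockSize(TILE_WIDTH, TILE_WIDTH);
// dim3 gridSize((colsB + TILE_WIDTH - 1) / TILE_WIDTH, (rowsA + TILE_WIDTH - 1) / TILE_WIDTH);
// matrixMultiplyOptimized<<<gridSize, blockSize>>>(A, B, C, rowsA, colsA, colsB);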

// KERNEL void random_init(Fr_t* params, uint num_bits, uint n)
// {
// int tid = blockIdx.x * blockDim.x + threadIdx.x;
// curandState state;

// // Initialize the RNG state for this thread.
// curand_init(1234, tid, 0, &state);
// curand_init(1234, tid, 0, &state);

// if (tid < n) {
// params[tid] = {curand(&state) & ((1U << num_bits) - 1), 0, 0, 0, 0, 0, 0, 0};
// params[tid] = blstrs__scalar__Scalar_mont(blstrs__scalar__Scalar_sub(params[tid], {1U << (num_bits - 1), 0, 0, 0, 0, 0, 0, 0}));
@@ -69,53 +86,62 @@ DEVICE Fr_t float_to_Fr(float x)
bool negative = (sign_x < 0);
uint rounded_abs = static_cast<uint>(abs_x);

if (negative){
if (negative)
{
return blstrs__scalar__Scalar_sub({0, 0, 0, 0, 0, 0, 0, 0}, {rounded_abs, 0, 0, 0, 0, 0, 0, 0});
}
else {
else
{
return {rounded_abs, 0, 0, 0, 0, 0, 0, 0};
}
}
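
// Illustrative examples (not part of the diff): float_to_Fr(3.0f) returns
// {3, 0, 0, 0, 0, 0, 0, 0}, while float_to_Fr(-3.0f) takes the negative branch and
// returns 0 - 3 in the scalar field, i.e. the field element p - 3 where p is the
// BLS12-381 scalar field order.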

KERNEL void float_to_Fr_kernel(float* fs, Fr_t* frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size)
KERNEL void float_to_Fr_kernel(float *fs, Fr_t *frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
uint dim0 = tid / frs_window_size;
uint dim1 = tid % frs_window_size;
if (tid >= frs_num_window * frs_window_size) return;
if (dim0 < fs_num_window && dim1 < fs_window_size) frs[dim0 * frs_window_size + dim1] = float_to_Fr(fs[dim0 * fs_window_size + dim1]);
else frs[tid] = {0, 0, 0, 0, 0, 0, 0, 0};
if (tid >= frs_num_window * frs_window_size)
return;
if (dim0 < fs_num_window && dim1 < fs_window_size)
frs[dim0 * frs_window_size + dim1] = float_to_Fr(fs[dim0 * fs_window_size + dim1]);
else
frs[tid] = {0, 0, 0, 0, 0, 0, 0, 0};
}

zkFC zkFC::from_float_gpu_ptr (uint input_size, uint output_size, float* float_gpu_ptr, const Commitment& generators)
{
zkFC zkFC::from_float_gpu_ptr(uint input_size, uint output_size, float *float_gpu_ptr, const Commitment &generators)
{
uint rounded_input_size = 1 << ceilLog2(input_size);
uint rounded_output_size = 1 << ceilLog2(output_size);

FrTensor weights(rounded_input_size * rounded_output_size);
float_to_Fr_kernel<<<(rounded_input_size * rounded_output_size+FrNumThread-1)/FrNumThread,FrNumThread>>>(float_gpu_ptr, weights.gpu_data, input_size, rounded_input_size, output_size, rounded_output_size);
float_to_Fr_kernel<<<(rounded_input_size * rounded_output_size + FrNumThread - 1) / FrNumThread, FrNumThread>>>(float_gpu_ptr, weights.gpu_data, input_size, rounded_input_size, output_size, rounded_output_size);
cudaDeviceSynchronize();
// cout << "Loaded weight is: " << weights << endl;
return zkFC(rounded_input_size, rounded_output_size, weights.mont(), generators);
}
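
// Note (illustrative, assuming ceilLog2(x) = ceil(log2(x))): each dimension is rounded
// up to the next power of two, e.g. input_size = 50 -> rounded_input_size = 64 and
// output_size = 10 -> rounded_output_size = 16; the extra entries are zero-filled by
// the else branch of float_to_Fr_kernel above.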

zkFC::zkFC(uint input_size, uint output_size, const FrTensor& t, const Commitment& c) : inputSize(input_size), outputSize(output_size), weights(t), com(c.commit(t)) {
if (t.size != input_size * output_size) throw std::runtime_error("Incompatible dimensions");
zkFC::zkFC(uint input_size, uint output_size, const FrTensor &t, const Commitment &c) : inputSize(input_size), outputSize(output_size), weights(t), com(c.commit(t))
{
if (t.size != input_size * output_size)
throw std::runtime_error("Incompatible dimensions");
}

FrTensor zkFC::load_float_gpu_input(uint batch_size, uint input_dim, float* input_ptr)
FrTensor zkFC::load_float_gpu_input(uint batch_size, uint input_dim, float *input_ptr)
{
uint rounded_batch_size = 1 << ceilLog2(batch_size);
uint rounded_input_dim = 1 << ceilLog2(input_dim);
FrTensor t(rounded_batch_size * rounded_input_dim);
float_to_Fr_kernel<<<(rounded_batch_size * rounded_input_dim+FrNumThread-1)/FrNumThread,FrNumThread>>>(input_ptr, t.gpu_data, batch_size, rounded_batch_size, input_dim, rounded_input_dim);
float_to_Fr_kernel<<<(rounded_batch_size * rounded_input_dim + FrNumThread - 1) / FrNumThread, FrNumThread>>>(input_ptr, t.gpu_data, batch_size, rounded_batch_size, input_dim, rounded_input_dim);
cudaDeviceSynchronize();
// cout << "Loaded input is: " << t << endl;
return t;
}

FrTensor zkFC::operator()(const FrTensor& X) const {
if (X.size % inputSize != 0) throw std::runtime_error("Incompatible dimensions");
FrTensor zkFC::operator()(const FrTensor &X) const
{
if (X.size % inputSize != 0)
throw std::runtime_error("Incompatible dimensions");
uint batchSize = X.size / inputSize;
dim3 blockSize(TILE_WIDTH, TILE_WIDTH);
dim3 gridSize((outputSize + blockSize.x - 1) / blockSize.x, (batchSize + blockSize.y - 1) / blockSize.y);
@@ -125,9 +151,11 @@ FrTensor zkFC::operator()(const FrTensor& X) const {
return out;
}

void zkFC::prove(const FrTensor& X, const FrTensor& Z, Commitment& generators) const {
void zkFC::prove(const FrTensor &X, const FrTensor &Z, Commitment &generators) const
{
// cout << X.size << " " << inputSize << endl;
if (X.size % inputSize != 0) {
if (X.size % inputSize != 0)
{
throw std::runtime_error("Incompatible dimensions 1");
}
uint batchSize = X.size / inputSize;
@@ -143,4 +171,3 @@ void zkFC::prove(const FrTensor& X, const FrTensor& Z, Commitment& generators) c
Z(u_Z);
generators.open(weights, com, concatenate<Fr_t>({u_out_dim, u_in_dim}));
}

30 changes: 15 additions & 15 deletions zkfc.cuh
@@ -6,42 +6,42 @@
#include <cstddef>
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include "bls12-381.cuh" // adjust this to point to the blstrs header file
#include "bls12-381.cuh" // adjust this to point to the blstrs header file
#include "fr-tensor.cuh"
#include "proof.cuh"
#include "commitment.cuh"

#define TILE_WIDTH 16

class zkFC {
class zkFC
{
private:

FrTensor weights;
G1TensorJacobian com;

public:
const uint inputSize;
const uint outputSize;
//zkFC(uint input_size, uint output_size, uint num_bits, const Commitment& generators);
zkFC(uint input_size, uint output_size, const FrTensor& t, const Commitment& generators);
FrTensor operator()(const FrTensor& X) const;
void prove(const FrTensor& X, const FrTensor& Z, Commitment& generators) const;
// zkFC(uint input_size, uint output_size, uint num_bits, const Commitment& generators);
zkFC(uint input_size, uint output_size, const FrTensor &t, const Commitment &generators);
FrTensor operator()(const FrTensor &X) const;
void prove(const FrTensor &X, const FrTensor &Z, Commitment &generators) const;

// static zkFC random_fc(uint input_size, uint output_size, uint num_bits, const Commitment& generators);
static zkFC from_float_gpu_ptr (uint input_size, uint output_size, float* float_gpu_ptr, const Commitment& generators);
static FrTensor load_float_gpu_input(uint batch_size, uint input_dim, float* input_ptr);
static zkFC from_float_gpu_ptr(uint input_size, uint output_size, float *float_gpu_ptr, const Commitment &generators);
static FrTensor load_float_gpu_input(uint batch_size, uint input_dim, float *input_ptr);
};
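
// Typical usage sketch (illustrative only; variable names such as w_gpu, x_gpu and
// batch_size are assumptions, not part of this header):
// zkFC fc = zkFC::from_float_gpu_ptr(input_size, output_size, w_gpu, generators);
// FrTensor X = zkFC::load_float_gpu_input(batch_size, input_size, x_gpu);
// FrTensor Z = fc(X);
// fc.prove(X, Z, generators);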

KERNEL void matrixMultiplyOptimized(Fr_t* A, Fr_t* B, Fr_t* C, int rowsA, int colsA, int colsB);
KERNEL void matrixMultiplyOptimized(Fr_t *A, Fr_t *B, Fr_t *C, int rowsA, int colsA, int colsB);

// KERNEL void random_init(Fr_t* params, uint num_bits, uint n)
// {
// int tid = blockIdx.x * blockDim.x + threadIdx.x;
// curandState state;

// // Initialize the RNG state for this thread.
// curand_init(1234, tid, 0, &state);
// curand_init(1234, tid, 0, &state);

// if (tid < n) {
// params[tid] = {curand(&state) & ((1U << num_bits) - 1), 0, 0, 0, 0, 0, 0, 0};
// params[tid] = blstrs__scalar__Scalar_mont(blstrs__scalar__Scalar_sub(params[tid], {1U << (num_bits - 1), 0, 0, 0, 0, 0, 0, 0}));
@@ -50,6 +50,6 @@ KERNEL void matrixMultiplyOptimized(Fr_t* A, Fr_t* B, Fr_t* C, int rowsA, int co

DEVICE Fr_t float_to_Fr(float x);

KERNEL void float_to_Fr_kernel(float* fs, Fr_t* frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size);
KERNEL void float_to_Fr_kernel(float *fs, Fr_t *frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size);

#endif // ZKFC_CUH
#endif // ZKFC_CUH