From 2f723c4afd814409f0f07fc672e60c05107876e6 Mon Sep 17 00:00:00 2001 From: Jacob Levinson Date: Sun, 21 Jul 2024 17:16:02 -0700 Subject: [PATCH 1/3] Adjusted regression Makefile to include new test --- tests/regression/Makefile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/regression/Makefile b/tests/regression/Makefile index 56b63d1e1..6fa3237a1 100644 --- a/tests/regression/Makefile +++ b/tests/regression/Makefile @@ -15,6 +15,7 @@ all: $(MAKE) -C sgemmx $(MAKE) -C conv3x $(MAKE) -C sgemm2x + $(MAKE) -C stencil3d run-simx: $(MAKE) -C basic run-simx @@ -30,6 +31,7 @@ run-simx: $(MAKE) -C sgemmx run-simx $(MAKE) -C conv3x run-simx $(MAKE) -C sgemm2x run-simx + $(MAKE) -C stencil3d run-simx run-rtlsim: $(MAKE) -C basic run-rtlsim @@ -45,6 +47,7 @@ run-rtlsim: $(MAKE) -C sgemmx run-rtlsim $(MAKE) -C conv3x run-rtlsim $(MAKE) -C sgemm2x run-rtlsim + $(MAKE) -C stencil3d run-rtlsim run-opae: $(MAKE) -C basic run-opae @@ -60,6 +63,7 @@ run-opae: $(MAKE) -C sgemmx run-opae $(MAKE) -C conv3x run-opae $(MAKE) -C sgemm2x run-opae + $(MAKE) -C stencil3d run-opae clean: $(MAKE) -C basic clean @@ -74,4 +78,5 @@ clean: $(MAKE) -C vecaddx clean $(MAKE) -C sgemmx clean $(MAKE) -C conv3x clean - $(MAKE) -C sgemm2x clean \ No newline at end of file + $(MAKE) -C sgemm2x clean + $(MAKE) -C stencil3d clean \ No newline at end of file From cd94288e05a60f196b54146570d9bef08e7769c9 Mon Sep 17 00:00:00 2001 From: Jacob Levinson Date: Sun, 21 Jul 2024 17:24:35 -0700 Subject: [PATCH 2/3] Added all files for stencil3d regression test --- tests/regression/stencil3d/Makefile | 14 ++ tests/regression/stencil3d/common.h | 18 ++ tests/regression/stencil3d/kernel.cpp | 58 +++++ tests/regression/stencil3d/main.cpp | 328 ++++++++++++++++++++++++++ 4 files changed, 418 insertions(+) create mode 100644 tests/regression/stencil3d/Makefile create mode 100644 tests/regression/stencil3d/common.h create mode 100644 tests/regression/stencil3d/kernel.cpp create mode 100644 tests/regression/stencil3d/main.cpp diff --git a/tests/regression/stencil3d/Makefile b/tests/regression/stencil3d/Makefile new file mode 100644 index 000000000..c4aacdb94 --- /dev/null +++ b/tests/regression/stencil3d/Makefile @@ -0,0 +1,14 @@ +ROOT_DIR := $(realpath ../../..) +include $(ROOT_DIR)/config.mk + +PROJECT := stencil3d + +SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT) + +SRCS := $(SRC_DIR)/main.cpp + +VX_SRCS := $(SRC_DIR)/kernel.cpp + +OPTS ?= -n32-b2 # 32x32x32 matrix and block size of 2x2x2 + +include ../common.mk \ No newline at end of file diff --git a/tests/regression/stencil3d/common.h b/tests/regression/stencil3d/common.h new file mode 100644 index 000000000..2c4a8ea00 --- /dev/null +++ b/tests/regression/stencil3d/common.h @@ -0,0 +1,18 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#ifndef TYPE +#define TYPE float +#endif + +typedef struct +{ + uint32_t grid_dim[3]; + uint32_t block_dim[3]; + uint32_t size; + uint32_t block_size; + uint64_t A_addr; + uint64_t B_addr; +} kernel_arg_t; + +#endif \ No newline at end of file diff --git a/tests/regression/stencil3d/kernel.cpp b/tests/regression/stencil3d/kernel.cpp new file mode 100644 index 000000000..48e2468ab --- /dev/null +++ b/tests/regression/stencil3d/kernel.cpp @@ -0,0 +1,58 @@ +#include +#include "common.h" + +void kernel_body(kernel_arg_t *arg) +{ + auto A = reinterpret_cast(arg->A_addr); + auto B = reinterpret_cast(arg->B_addr); + auto size = arg->size; // Assuming 'size' now represents one dimension of a cubic space. + + // Calculate global column, row, and depth indices using both block and thread indices + int col = blockIdx.x * blockDim.x + threadIdx.x; + int row = blockIdx.y * blockDim.y + threadIdx.y; + int dep = blockIdx.z * blockDim.z + threadIdx.z; + + TYPE sum = 0; + int count = 0; + + // Stencil kernel size is assumed to be 3x3x3 + for (int dz = -1; dz <= 1; ++dz) + { + for (int dy = -1; dy <= 1; ++dy) + { + for (int dx = -1; dx <= 1; ++dx) + { + // Compute the neighbor's index, handling boundary conditions manually + int nz = dep + dz; + int ny = row + dy; + int nx = col + dx; + + // Clamp the indices to be within the boundary of the array + if (nz < 0) {nz = 0;} + else if (nz >= size){ + nz = size - 1;} + if (ny < 0) { + ny = 0; } + else if (ny >= size){ + ny = size - 1;} + if (nx < 0) { + nx = 0;} + else if (nx >= size){ + nx = size - 1;} + + // Add the neighbor's value to sum + sum += A[nz * size * size + ny * size + nx]; + count++; + } + } + } + + // Compute the average of the sum of neighbors and write to the output array + B[dep * size * size + row * size + col] = sum / count; +} + +int main() +{ + auto arg = (kernel_arg_t *)csr_read(VX_CSR_MSCRATCH); + return vx_spawn_threads(3, arg->grid_dim, arg->block_dim, (vx_kernel_func_cb)kernel_body, arg); +} \ No newline at end of file diff --git a/tests/regression/stencil3d/main.cpp b/tests/regression/stencil3d/main.cpp new file mode 100644 index 000000000..a47f94710 --- /dev/null +++ b/tests/regression/stencil3d/main.cpp @@ -0,0 +1,328 @@ + +#include +#include +#include +#include +#include +#include "common.h" + +#define FLOAT_ULP 6 + +#define RT_CHECK(_expr) \ + do \ + { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +template +class Comparator +{ +}; + +template <> +class Comparator +{ +public: + static const char *type_str() + { + return "integer"; + } + static int generate() + { + return rand(); + } + static bool compare(int a, int b, int index, int errors) + { + if (a != b) + { + if (errors < 100) + { + printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b); + } + return false; + } + return true; + } +}; + +template <> +class Comparator +{ +private: + union Float_t + { + float f; + int i; + }; + +public: + static const char *type_str() + { + return "float"; + } + static float generate() + { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) + { + union fi_t + { + float f; + int32_t i; + }; + fi_t fa, fb; + fa.f = a; + fb.f = b; + auto d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP) + { + if (errors < 100) + { + printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b); + } + return false; + } + return true; + } +}; + +static void stencil_cpu(TYPE *out, const TYPE *in, uint32_t width, uint32_t height, uint32_t depth) +{ + // We'll need to handle boundary conditions. Let's assume we use boundary replication. + for (uint32_t z = 0; z < depth; z++) + { + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + TYPE sum = 0; + int count = 0; + + // Iterate over the neighborhood + for (int dz = -1; dz <= 1; dz++) + { + for (int dy = -1; dy <= 1; dy++) + { + for (int dx = -1; dx <= 1; dx++) + { + // Compute the neighbor's index + int nx = (int)x + dx; + int ny = (int)y + dy; + int nz = (int)z + dz; + + // Check bounds and replicate the boundary values + if (nx < 0) + { + nx = 0; + } + else if (nx >= (int)width) + { + nx = width - 1; + } + if (ny < 0) + { + ny = 0; + } + else if (ny >= (int)height) + { + ny = height - 1; + } + if (nz < 0) + { + nz = 0; + } + else if (nz >= (int)depth) + { + nz = depth - 1; + } + + // Sum up the values + sum += in[nz * width * height + ny * width + nx]; + count++; + } + } + } + + // Write the averaged value to the output array + out[z * width * height + y * width + x] = sum / count; + } + } + } +} + +const char *kernel_file = "kernel.vxbin"; +uint32_t size = 64; +uint32_t block_size = 2; + +vx_device_h device = nullptr; +vx_buffer_h A_buffer = nullptr; +vx_buffer_h B_buffer = nullptr; +vx_buffer_h krnl_buffer = nullptr; +vx_buffer_h args_buffer = nullptr; +kernel_arg_t kernel_arg = {}; + +static void show_usage() +{ + std::cout << "Vortex Test." << std::endl; + std::cout << "Usage: [-k: kernel] [-n matrix_size] [-b:block_size] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) +{ + int c; + while ((c = getopt(argc, argv, "n:t:k:h?")) != -1) + { + switch (c) + { + case 'n': + size = atoi(optarg); + break; + case 'b': + block_size = atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + case '?': + { + show_usage(); + exit(0); + } + break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() +{ + if (device) + { + vx_mem_free(A_buffer); + vx_mem_free(B_buffer); + vx_mem_free(krnl_buffer); + vx_mem_free(args_buffer); + vx_dev_close(device); + } +} + +int main(int argc, char *argv[]) +{ + // parse command arguments + parse_args(argc, argv); + + if ((size / block_size) * block_size != size) + { + printf("Error: matrix size %d must be a multiple of block size %d\n", size, block_size); + return -1; + } + + std::srand(50); + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + uint32_t size_cubed = size * size * size; + uint32_t buf_size = size_cubed * sizeof(TYPE); + + std::cout << "data type: " << Comparator::type_str() << std::endl; + std::cout << "matrix size: " << size << "x" << size << std::endl; + std::cout << "block size: " << block_size << "x" << block_size << std::endl; + + kernel_arg.grid_dim[0] = size / block_size; + kernel_arg.grid_dim[1] = size / block_size; + kernel_arg.grid_dim[2] = size / block_size; + kernel_arg.block_dim[0] = block_size; + kernel_arg.block_dim[1] = block_size; + kernel_arg.block_dim[2] = block_size; + kernel_arg.size = size; + kernel_arg.block_size = block_size; + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &A_buffer)); + RT_CHECK(vx_mem_address(A_buffer, &kernel_arg.A_addr)); + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &B_buffer)); + RT_CHECK(vx_mem_address(B_buffer, &kernel_arg.B_addr)); + + std::cout << "A_addr=0x" << std::hex << kernel_arg.A_addr << std::endl; + std::cout << "B_addr=0x" << std::hex << kernel_arg.B_addr << std::endl; + + // allocate host buffers + std::cout << "allocate host buffers" << std::endl; + std::vector h_A(size_cubed); + std::vector h_B(size_cubed); + + // generate source data + for (uint32_t i = 0; i < size_cubed; ++i) + { + h_A[i] = Comparator::generate(); + } + + // upload source buffer0 + std::cout << "upload source buffer0" << std::endl; + RT_CHECK(vx_copy_to_dev(A_buffer, h_A.data(), 0, buf_size)); + + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer)); + + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer)); + + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device, krnl_buffer, args_buffer)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + // download destination buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(h_B.data(), B_buffer, 0, buf_size)); + + // verify result + std::cout << "verify result" << std::endl; + int errors = 0; + { + std::vector h_ref(size_cubed); + stencil_cpu(h_ref.data(), h_A.data(), size, size, size); + + for (uint32_t i = 0; i < h_ref.size(); ++i) + { + if (!Comparator::compare(h_B[i], h_ref[i], i, errors)) + { + ++errors; + } + } + } + + // cleanup + std::cout << "cleanup" << std::endl; + cleanup(); + + if (errors != 0) + { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return errors; + } + + std::cout << "PASSED!" << std::endl; + + return 0; +} From b489cc7abdcb45dfc439237202585c45421c7044 Mon Sep 17 00:00:00 2001 From: Jacob Levinson Date: Sun, 21 Jul 2024 17:25:12 -0700 Subject: [PATCH 3/3] Added *.cache to gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d1571b535..039456040 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /build* -/.vscode \ No newline at end of file +/.vscode +*.cache \ No newline at end of file