
Commit

Merge pull request #426 from Chia-Network/develop
Version 3.1.0

- Add CUDA disk-hybrid mode requiring 128G of system DRAM.
- Integrate a plot checker into the CUDA plotter.
- Expose `--no-direct-io` to disable direct I/O to the output plot directory.
- Fix related issues on Windows.
- Fix a bug where some plots overflowed slice buffers.
- Fix build issues and other trivial issues.
- Expose experimental/WIP CUDA 16G disk-hybrid mode on Linux.
- Update README with CUDA and compression information.
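
A hypothetical invocation combining the new features, pieced together from the README examples and debug configurations in this commit (flag placement and the `--check`/`--no-direct-io` combination are assumed, not taken verbatim from the source):

```bash
# 128G disk-hybrid CUDA plot with the integrated plot checker,
# writing to the output directory without direct I/O.
./bladebit_cuda -z <compression_level> \
    -f <farmer_public_key> -c <pool_contract_address> \
    cudaplot --disk-128 -t1 <temp_dir> \
    --check 100 --check-threshold 2 \
    --no-direct-io \
    <output_directory>
```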
haorldbchi authored Oct 3, 2023

2 parents 02a8e68 + 7b25480 commit e9836f8
Showing 79 changed files with 5,724 additions and 2,148 deletions.
8 changes: 0 additions & 8 deletions .idea/vcs.xml

Some generated files are not rendered by default.

43 changes: 0 additions & 43 deletions .vscode/c_cpp_properties.json

This file was deleted.

63 changes: 46 additions & 17 deletions .vscode/launch.json
@@ -131,19 +131,25 @@
"preLaunchTask" : "build_cuda_debug",

"program": "${workspaceFolder}/build/bladebit_cuda",

// "-c", "xch1uf48n3f50xrs7zds0uek9wp9wmyza6crnex6rw8kwm3jnm39y82q5mvps6",
// "-i", "7a709594087cca18cffa37be61bdecf9b6b465de91acb06ecb6dbe0f4a536f73", // Yes overflow
// "--memo", "80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef207d52406afa2b6d7d92ea778f407205bd9dca40816c1b1cacfca2a6612b93eb",

"args":
"-w -n 1 -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --check 100 --check-threshold 2 /home/harold/plot",

// "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot /home/harold/plot",
// "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk --no-direct-buffers /home/harold/plot",
// "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk /home/harold/plot",
"-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-64 -t1 /home/harold/plotdisk /home/harold/plot",

"args":
// "-w --compress 3 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot/tmp",
"-w --compress 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot",

"windows": {
"type": "cppvsdbg",
"program": "${workspaceFolder}/build/Debug/bladebit_cuda.exe",
"args": "--benchmark --compress 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot D:/"
// "args": "--benchmark -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot D:/"
"args": "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot -t2 D:/chia_test_plots D:/chia_test_plots",
}
},

@@ -236,7 +242,7 @@

{
"name" : "Tests",

"type" : "cppdbg",
"osx": {
"MIMode": "lldb",
@@ -245,7 +251,7 @@
"stopAtEntry" : false,
"cwd" : "${workspaceFolder}",
"preLaunchTask" : "build_tests_debug",
"console" : "internalConsole",
// "console" : "internalConsole",

"program": "${workspaceRoot}/build/tests",

@@ -260,6 +266,8 @@
// { "name": "bb_plot" , "value": "/home/harold/plot/tmp/plot-k32-c06-2023-02-14-21-43-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot" },
{ "name": "bb_clevel" , "value": "1" },
{ "name": "bb_end_clevel" , "value": "1" },

{ "name": "bb_queue_path" , "value": "/home/ubuntu/plot" },
],

"args": [
@@ -273,7 +281,10 @@
// "line-point-deltas"
// "compressed-plot-proof"
// "compressed-plot-qualities"
"macos-threads"
// "macos-threads"
// "disk-slices"
// "disk-buckets"
"[disk-queue]"
]
}

@@ -285,10 +296,16 @@
"stopAtEntry" : false,
"cwd" : "${workspaceFolder}",
"preLaunchTask" : "build_debug",
"console" : "internalConsole",

"program": "${workspaceFolder}/build/bladebit",

// "program": "${workspaceFolder}/build/bladebit_cuda",

"linux": {
"MIMode": "gdb",
"miDebuggerPath": "/usr/bin/gdb",
"program": "${workspaceFolder}/build/bladebit"
},

"windows": {
"type" : "cppvsdbg",
"program": "${workspaceFolder}/build/debug/bladebit.exe"
@@ -301,6 +318,11 @@
// "-t", "48",
// "-t", "1",

// "validate", "--f7", "2",
// "/home/harold/plot/jmplot-c01-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
// "/home/harold/plot/plot-k32-c01-2023-07-19-00-29-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
// "/home/harold/plot/plot-k32-c01-2023-08-03-04-57-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"

// "-t", "1", "validate", "--f7", "324", "~/plot/tmp/plot-k32-c01-2023-02-13-22-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
// "validate", "--f7", "7", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
// "validate", "--cuda", "--f7", "4", "~/plot/tmp/plot-k32-c07-2023-04-13-16-08-330fbf677f78641061c93312c1a7ffa28138739b69975f3b874df6acc3e76378.plot",
@@ -322,8 +344,8 @@
// // "/home/harold/plot/tmp/plot-k32-c04-2023-01-31-23-15-5cfc42dfaa5613da0b425994c2427a2ba4a8efcfb49e7844e93c0854baf09863.plot"

// Simulation
"-t", "1", "simulate", "--seed", "b8e9ec6bc179ae6ba5f5c3483f7501db32879efa84b62001d27601a540dca5ff",
"-p", "16", "-n", "1", "--power", "45", "--size", "4PB", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
// "-t", "1", "simulate", "--seed", "b8e9ec6bc179ae6ba5f5c3483f7501db32879efa84b62001d27601a540dca5ff",
// "-p", "16", "-n", "1", "--power", "45", "--size", "4PB", "~/plot/tmp/plot-k32-c01-2023-03-09-14-07-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
// "-t", "30", "simulate", "-p", "2", "-n", "600", "~/plot/tmp/plot-k32-c07-2023-03-16-11-49-7732c75d9f3b5ad1fc804bb7429121e334bd4f25f9bbbb76ef0370b5a0e80aae.plot"

// "-m",
@@ -335,11 +357,18 @@
// "--f7", "3983284117", "/home/harito/plot/tmp/gpu_1.plot",

/// Compare
// "plotcmp",
// "/home/harito/plot/tmp/gpu_1.plot.old",
// "/home/harold/plot-tmpfs/gpu_1.plot",
// "/home/harito/plot/tmp/gpu_1.plot",
// "/home/harito/plot/tmp/plot-k32-2022-11-21-05-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
"plotcmp",
"/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
"/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",

// "/home/harold/plot/plot-k32-c01-2023-08-03-22-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
// "/home/harold/plot/jmplot-c01-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"

// Check
// "check",
// "-n", "100", "--seed", "dc471c4d905ba3a65c6cecb46d97b132c0c98f51d416db5ec5cbdbe95ef2832f",
// "/home/harold/plot/plot-k32-c01-2023-07-19-00-29-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
// "/home/harold/plot/jm.plot"
]
},

27 changes: 22 additions & 5 deletions .vscode/settings.json
@@ -4,16 +4,16 @@
"nominmax"
],
"files.associations": {
"*.sd": "yaml",
"*.userprefs": "xml",
"*.make": "makefile",
"Fastfile": "ruby",
"*.plist": "xml",
"*.sd": "yaml",
"*.json": "jsonc",
"*.ir": "llvm",
"*.qs": "javascript",
"*.ac": "shellscript",
"player": "json",
"*.userprefs": "xml",
"*.make": "makefile",
"memory": "cpp",
"cstddef": "cpp",
"string": "cpp",
@@ -113,7 +113,18 @@
"filesystem": "cpp",
"__bits": "cpp",
"csignal": "cpp",
"cfenv": "cpp"
"cfenv": "cpp",
"ranges": "cpp",
"xhash": "cpp",
"xmemory": "cpp",
"xstddef": "cpp",
"xstring": "cpp",
"xtr1common": "cpp",
"xtree": "cpp",
"xutility": "cpp",
"__assert": "cpp",
"*.inc": "cpp",
"xiosbase": "cpp"
},
"cSpell.words": [
"Ryzen"
@@ -124,7 +135,13 @@
"cmake.preferredGenerators": [
"Unix Makefiles",
"Visual Studio 17 2022"
]
],
// "cmake.buildArgs": [],
"cmake.configureSettings": {
"BB_ENABLE_TESTS": "ON",
"BB_CUDA_USE_NATIVE": "ON"
},
"C_Cpp.dimInactiveRegions": false,
// "cmake.generator": "Unix Makefiles"
// "cmake.generator": "Visual Studio 17 2022"

15 changes: 15 additions & 0 deletions Bladebit.cmake
@@ -227,6 +227,8 @@ set(src_bladebit
src/plotting/PlotWriter.cpp
src/plotting/PlotWriter.h
src/plotting/Tables.h
src/plotting/BufferChain.h
src/plotting/BufferChain.cpp

src/plotting/f1/F1Gen.h
src/plotting/f1/F1Gen.cpp
@@ -258,6 +260,7 @@ set(src_bladebit
src/tools/PlotReader.cpp
src/tools/PlotReader.h
src/tools/PlotValidator.cpp
src/tools/PlotChecker.cpp

src/util/Array.h
src/util/Array.inl
@@ -289,6 +292,18 @@ set(src_bladebit
src/harvesting/GreenReaper.h
src/harvesting/GreenReaperInternal.h
src/harvesting/Thresher.h

src/plotting/DiskQueue.h
src/plotting/DiskQueue.cpp
src/plotting/DiskBuffer.h
src/plotting/DiskBuffer.cpp
src/plotting/DiskBucketBuffer.h
src/plotting/DiskBucketBuffer.cpp
src/plotting/DiskBufferBase.h
src/plotting/DiskBufferBase.cpp

src/util/MPMCQueue.h
src/util/CommandQueue.h
)

target_sources(bladebit_core PUBLIC ${src_bladebit})
5 changes: 4 additions & 1 deletion BladebitCUDA.cmake
@@ -22,6 +22,9 @@ add_executable(bladebit_cuda
cuda/CudaPlotUtil.cu
cuda/GpuStreams.h
cuda/GpuStreams.cu
cuda/GpuDownloadStream.cu
cuda/GpuQueue.h
cuda/GpuQueue.cu

# Harvester
cuda/harvesting/CudaThresher.cu
@@ -42,7 +45,7 @@ target_compile_options(bladebit_cuda PRIVATE
>

$<${is_cuda_debug}:
-G
# -G
>
)

16 changes: 9 additions & 7 deletions CMakeLists.txt
@@ -1,6 +1,7 @@
cmake_minimum_required(VERSION 3.19 FATAL_ERROR)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

set(CMAKE_CONFIGURATION_TYPES Release Debug)
@@ -9,15 +10,15 @@ if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release"
CACHE STRING "Possible values are: Release, Debug"
FORCE
)
)
endif()

# Allows for CMAKE_MSVC_RUNTIME_LIBRARY
if(POLICY CMP0091)
cmake_policy(SET CMP0091 NEW)
endif()

set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "macOS minimum supported version.")
set(CMAKE_OSX_DEPLOYMENT_TARGET "10.16" CACHE STRING "macOS minimum supported version.")
set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>" CACHE STRING "MSVC Runtime Library")

project(bladebit LANGUAGES C CXX ASM)
@@ -83,10 +84,10 @@ endif()
# NOTE: These are mostly sandbox test environment, not proper tests
option(BB_ENABLE_TESTS "Enable tests." OFF)
option(NO_CUDA_HARVESTER "Explicitly disable CUDA in the bladebit_harvester target." OFF)
option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." ON)
option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." OFF)
option(BB_HARVESTER_ONLY "Enable only the harvester target." OFF)
option(BB_HARVESTER_STATIC "Build the harvester target as a static library." OFF)

option(BB_CUDA_USE_NATIVE "Only build the native CUDA architecture when in release mode." OFF)

#
# Dependencies
@@ -103,7 +104,7 @@ if(NOT ${BB_HARVESTER_ONLY})
GIT_REPOSITORY https://github.com/Chia-Network/bls-signatures.git
GIT_TAG 2.0.2
EXCLUDE_FROM_ALL ${BB_IS_DEPENDENCY}
)
)

set(BUILD_BLS_PYTHON_BINDINGS "0" CACHE STRING "0")
set(BUILD_BLS_TESTS "0" CACHE STRING "")
@@ -130,6 +131,7 @@ set(is_x86 $<OR:$<STREQUAL:${CMAKE_HOST_SYSTEM_PROCESSOR},AMD64>,$<STREQUAL:${CM
set(is_arm $<OR:$<STREQUAL:${CMAKE_HOST_SYSTEM_PROCESSOR},arm64>,$<STREQUAL:${CMAKE_HOST_SYSTEM_PROCESSOR},aarch64>>)
set(is_msvc_c_cpp $<AND:${is_c_cpp},$<CXX_COMPILER_ID:MSVC>>)


if(CUDAToolkit_FOUND AND NOT ${NO_CUDA_HARVESTER})
set(have_cuda $<BOOL:1>)
else()
@@ -143,7 +145,7 @@ endif()
include(Config.cmake)

if(NOT ${BB_HARVESTER_ONLY})
if(NOT BB_IS_DEPENDENCY AND (NOT BB_NO_EMBED_VERSION))
if((NOT BB_IS_DEPENDENCY) AND (NOT BB_NO_EMBED_VERSION))
include(cmake_modules/EmbedVersion.cmake)
endif()
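
The new `BB_CUDA_USE_NATIVE` option (and `BB_ENABLE_TESTS`, wired into `.vscode/settings.json` above) can also be set from a plain command-line configure; a minimal sketch (build directory name illustrative):

```bash
cmake -B build -DCMAKE_BUILD_TYPE=Release \
      -DBB_ENABLE_TESTS=ON \
      -DBB_CUDA_USE_NATIVE=ON
cmake --build build --target bladebit_cuda --config Release
```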

118 changes: 59 additions & 59 deletions Config.cmake
@@ -1,6 +1,11 @@
# Base interface configuration project
add_library(bladebit_config INTERFACE)

target_include_directories(bladebit_config INTERFACE
${INCLUDE_DIRECTORIES}
${CMAKE_CURRENT_SOURCE_DIR}/src
)

target_compile_definitions(bladebit_config INTERFACE
$<${is_release}:
_NDEBUG=1
@@ -22,32 +27,34 @@ target_compile_definitions(bladebit_config INTERFACE

target_compile_options(bladebit_config INTERFACE

# GCC or Clang
$<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:
-Wall
-Wno-comment
-Wno-unknown-pragmas
-g

$<${is_release}:
-O3
$<${is_c_cpp}:
# GCC or Clang
$<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:
-Wall
-Wno-comment
-Wno-unknown-pragmas
-g

$<${is_release}:
-O3
>

$<${is_debug}:
-O0
>
>

$<${is_debug}:
-O0
# GCC
$<$<CXX_COMPILER_ID:GNU>:
-fmax-errors=5
>
>

# GCC
$<$<CXX_COMPILER_ID:GNU>:
-fmax-errors=5
>

# Clang
$<$<CXX_COMPILER_ID:Clang,AppleClang>:
-ferror-limit=5
-fdeclspec
-Wno-empty-body
# Clang
$<$<CXX_COMPILER_ID:Clang,AppleClang>:
-ferror-limit=5
-fdeclspec
-Wno-empty-body
>
>

# MSVC
@@ -129,43 +136,36 @@ cmake_policy(SET CMP0105 NEW)
set(cuda_archs

$<${is_cuda_release}:
## Maxwell
## Tesla/Quadro M series
-gencode=arch=compute_50,code=sm_50
## Quadro M6000 , GeForce 900, GTX-970, GTX-980, GTX Titan X
-gencode=arch=compute_52,code=sm_52
## Tegra (Jetson) TX1 / Tegra X1, Drive CX, Drive PX, Jetson Nano
-gencode=arch=compute_53,code=sm_53
## Pascal
## GeForce 1000 series
-gencode=arch=compute_60,code=sm_60
## GeForce GTX 1050Ti, GTX 1060, GTX 1070, GTX 1080
-gencode=arch=compute_61,code=sm_61
## Drive Xavier, Jetson AGX Xavier, Jetson Xavier NX
-gencode=arch=compute_62,code=sm_62
## Volta
## GV100, Tesla V100, Titan V
-gencode=arch=compute_70,code=sm_70
## Tesla V100
-gencode=arch=compute_72,code=sm_72
## Turing
-gencode=arch=compute_75,code=sm_75
## Ampere
## NVIDIA A100, DGX-A100
-gencode=arch=compute_80,code=sm_80
## GeForce RTX 3000 series, NVIDIA A100
-gencode=arch=compute_86,code=sm_86
## Jetson Orin
-gencode=arch=compute_87,code=sm_87
## Lovelace
## NVIDIA GeForce RTX 4090, RTX 4080, RTX 6000, Tesla L40
-gencode=arch=compute_89,code=sm_89
## Future proofing
-gencode=arch=compute_89,code=compute_89
## Hopper
## NVIDIA H100 (GH100)
# -gencode=arch=compute_90,code=sm_90
# -gencode=arch=compute_90a,code=sm_90a
$<$<BOOL:${BB_CUDA_USE_NATIVE}>:
-arch=native
>

$<$<NOT:$<BOOL:${BB_CUDA_USE_NATIVE}>>:

# Maxwell
-gencode=arch=compute_50,code=sm_50 # Tesla/Quadro M series
-gencode=arch=compute_52,code=sm_52 # Quadro M6000 , GeForce 900, GTX-970, GTX-980, GTX Titan X
-gencode=arch=compute_53,code=sm_53 # Tegra (Jetson) TX1 / Tegra X1, Drive CX, Drive PX, Jetson Nano

# Pascal
-gencode=arch=compute_60,code=sm_60 # GeForce 1000 series
-gencode=arch=compute_61,code=sm_61 # GeForce GTX 1050Ti, GTX 1060, GTX 1070, GTX 1080
-gencode=arch=compute_62,code=sm_62 # Drive Xavier, Jetson AGX Xavier, Jetson Xavier NX

# Volta
-gencode=arch=compute_70,code=sm_70 # GV100, Tesla V100, Titan V
-gencode=arch=compute_72,code=sm_72 # Tesla V100
-gencode=arch=compute_75,code=sm_75 # Turing

# Ampere
-gencode=arch=compute_80,code=sm_80 # NVIDIA A100, DGX-A100
-gencode=arch=compute_86,code=sm_86 # GeForce RTX 3000 series, NVIDIA A100
-gencode=arch=compute_87,code=sm_87 # Jetson Orin

# Lovelace
-gencode=arch=compute_89,code=sm_89 # NVIDIA GeForce RTX 4090, RTX 4080, RTX 6000, Tesla L40
-gencode=arch=compute_89,code=compute_89 # Future proofing
>
>

$<${is_cuda_debug}:
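
With `BB_CUDA_USE_NATIVE=ON`, release builds now pass `-arch=native`, which tells recent CUDA toolkits to compile only for the GPU architectures detected on the build machine instead of the full `-gencode` list above; builds are much faster, at the cost of a non-portable binary. For illustration (file name taken from this diff, flags abridged):

```bash
# Target only the build machine's GPU:
nvcc -arch=native -c cuda/CudaPlotUtil.cu

# Or emit code for an explicit set of architectures (portable binaries):
nvcc -gencode=arch=compute_86,code=sm_86 \
     -gencode=arch=compute_89,code=sm_89 \
     -c cuda/CudaPlotUtil.cu
```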
12 changes: 9 additions & 3 deletions Harvester.cmake
@@ -1,5 +1,5 @@
if(NOT ${BB_HARVESTER_STATIC})
add_library(bladebit_harvester SHARED)
add_library(bladebit_harvester SHARED src/harvesting/HarvesterDummy.cpp)
else()
add_library(bladebit_harvester STATIC)
endif()
@@ -82,9 +82,15 @@ target_sources(bladebit_harvester PRIVATE
cuda/CudaF1.cu
cuda/CudaMatch.cu
cuda/CudaPlotUtil.cu
cuda/GpuQueue.cu

# TODO: Remove this, ought not be needed in harvester
# TODO: Does this have to be here?
cuda/GpuStreams.cu
cuda/GpuDownloadStream.cu
src/plotting/DiskBuffer.cpp
src/plotting/DiskBucketBuffer.cpp
src/plotting/DiskBufferBase.cpp
src/plotting/DiskQueue.cpp
>

$<$<NOT:${have_cuda}>:
@@ -159,7 +165,7 @@ if(CUDAToolkit_FOUND)
CUDA_RUNTIME_LIBRARY Static
CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
# CUDA_ARCHITECTURES OFF
CUDA_ARCHITECTURES OFF
)
endif()

103 changes: 94 additions & 9 deletions README.md
@@ -1,8 +1,71 @@
# BladeBit Chia Plotter
# Bladebit Chia Plotter

[![Release Builds](https://github.com/Chia-Network/bladebit/actions/workflows/build-release.yml/badge.svg?branch=master&event=push)](https://github.com/Chia-Network/bladebit/actions/workflows/build-release.yml)

A high-performance **k32-only**, Chia (XCH) plotter supporting in-RAM and disk-based plotting.
A high-performance **k32-only** Chia (XCH) plotter.

Bladebit supports 3 plotting modes:
- Fully in-RAM (no drives required), CPU-based mode.
- GPU (CUDA-based) mode, either fully in-RAM or disk-hybrid.
- Disk-based mode.

## Usage
Run `bladebit --help` to see general help. For command-specific help, use `bladebit help <command_name>`.

## Requirements

**CUDA**

An NVIDIA GPU is required for this mode, which is exposed via the `cudaplot` command in the separate `bladebit_cuda` executable. It has mainly been tested on consumer cards from the **10xx** series and up.

| Mode | OS | DRAM | VRAM | CUDA capability
|--------------------------------|----------------|------|------|----------------
| In-RAM | Linux, Windows | 256G | 8G | 5.2 and up
| Disk-hybrid 128G | Linux, Windows | 128G | 8G | 5.2 and up
| Disk-hybrid 16G (WIP) | Linux | 16G | 8G | 5.2 and up

> *NOTE: The 16G mode is currently a work in progress; at this stage it only works on Linux, and direct I/O is unavailable in this mode.*

**CPU RAM-Only**

Available on Linux, Windows and macOS.
Requires at least **416G** of system DRAM.


**Disk**

Available on Linux, Windows and macOS.

A minimum of **4 GiB of RAM** is required, with lower bucket counts requiring up to 12 GiB of RAM. Roughly **480 GiB of disk space** is required in the default mode, or around **390 GiB of disk space** with `--alternate` mode enabled.

The exact amounts of RAM and disk space required may vary slightly depending on the system's page size and the target disk file system block size (block-alignment is required for direct I/O).

SSDs are highly recommended for disk-based plotting.


## Compressed Plots

Compressed plots are supported in CUDA mode and in RAM-only mode. The CPU disk-based mode does **NOT** currently support compressed plots.

Compressed plots are currently supported for compression levels from **C1** to **C7**. Note that bladebit compression levels are not compatible with other plotters' compression levels. These levels are based on the *number of bits dropped from an entry, excluding the minimum bits required to fully drop a table*. At `k=32` the first table is fully excluded from the plot at 16 bits dropped.

> *NOTE: Although higher compression levels are available, support for farming them has not yet been implemented, so they are disabled. They will be enabled in the future.*

Compression levels are currently roughly equivalent to the following plot sizes.

| Compression Level | Plot Size
|-------------------|-------------
| C1 | 87.5 GiB
| C2 | 86.0 GiB
| C3 | 84.4 GiB
| C4 | 82.8 GiB
| C5 | 81.2 GiB
| C6 | 79.6 GiB
| C7 | 78.0 GiB

These sizes might improve in the future with further compression optimizations.
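
As a quick sanity check of the table, each level saves roughly a further 1.6 GiB; a minimal sketch, assuming the standard ~101.4 GiB size of an uncompressed k32 plot (the baseline is an assumption, not stated in this README):

```cpp
#include <cstdio>

int main()
{
    const double uncompressedGiB = 101.4; // assumed uncompressed k32 baseline
    const double sizesGiB[]      = { 87.5, 86.0, 84.4, 82.8, 81.2, 79.6, 78.0 }; // C1..C7

    for( int level = 1; level <= 7; level++ )
        std::printf( "C%d: %.1f GiB (~%.1f GiB saved)\n",
                     level, sizesGiB[level - 1], uncompressedGiB - sizesGiB[level - 1] );
}
```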


## Requirements

@@ -39,7 +102,7 @@ SSDs are highly recommended for disk-based plotting.


## Prerequisites
Linux, Windows and MacOS (both intel and ARM (Apple Silicon)) are supported.
Linux, Windows and macOS (both Intel and ARM) are supported.


### Linux
@@ -83,8 +146,12 @@ cmake --build . --target bladebit --config Release
The resulting binary will be found under the `build/` directory.
On Windows it will be under `build/Release/`.

For **bladebit_cuda**, the CUDA toolkit must be installed. The target name is `bladebit_cuda`.

For simplicity, the `build.sh` or `build-cuda.sh` scripts can be used to build. On Windows this requires Git Bash or a similar bash-based shell to run.

## Usage
Run **bladebit** with the `-h` for complete usage and command line options:
Run **bladebit** (or **bladebit_cuda**) with the `-h` flag for complete usage and command-line options:

```bash
# Linux & macOS
@@ -93,18 +160,33 @@ build/bladebit -h
# Windows
build/Release/bladebit.exe -h
```
The bladebit CLI uses the format `bladebit <GLOBAL_OPTIONS> <command> <COMMAND_OPTIONS>`.


The bladebit CLI uses the format `bladebit <GLOBAL_OPTIONS> <sub_command> <COMMAND_OPTIONS>`.

Use the aforementioned `-h` parameter to get the full list of sub-commands and `GLOBAL_OPTIONS`.
The `sub_command`-specific `COMMAND_OPTIONS` can be obtained by using the `help` sub command with the desired command as the parameter:
Use the aforementioned `-h` parameter to get the full list of commands and `GLOBAL_OPTIONS`.
The `command`-specific `COMMAND_OPTIONS` can be obtained by using the `help` sub-command with the desired command as the parameter:

```bash
bladebit help cudaplot
bladebit help ramplot
bladebit help diskplot
```

### CUDA
Basic `cudaplot` usage:
```bash
# OG plots
./bladebit_cuda -f <farmer_public_key> -p <pool_public_key> cudaplot <output_directory>

# Portable plots
./bladebit_cuda -f <farmer_public_key> -c <pool_contract_address> cudaplot <output_directory>

# Compressed plots
./bladebit_cuda -z <compression_level> -f <farmer_public_key> -c <pool_contract_address> cudaplot <output_directory>

# 128G disk-hybrid mode
./bladebit_cuda -z <compression_level> -f <farmer_public_key> -c <pool_contract_address> cudaplot --disk-128 -t1 <temp_dir> <output_directory>
```
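
The plot checker integrated in this release can be attached to the same command; a hypothetical example based on the debug configuration in `.vscode/launch.json` above (`--check <n>` and `--check-threshold <t>` semantics assumed: check n proofs in the finished plot against a pass threshold):

```bash
# Plot, then check 100 proofs in the resulting plot.
./bladebit_cuda -z <compression_level> -f <farmer_public_key> -c <pool_contract_address> \
    cudaplot --check 100 --check-threshold 2 <output_directory>
```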

### In-RAM
Basic `ramplot` usage:
```bash
@@ -113,6 +195,9 @@ Basic `ramplot` usage:

# Portable plots
./bladebit -f <farmer_public_key> -c <pool_contract_address> ramplot <output_directory>

# Compressed plots
./bladebit -z <compression_level> -f <farmer_public_key> -c <pool_contract_address> ramplot <output_directory>
```

### Disk-Based
9 changes: 7 additions & 2 deletions Tests.cmake
@@ -1,10 +1,15 @@
include(cmake_modules/FindCatch2.cmake)

add_executable(tests ${src_bladebit})
add_executable(tests ${src_bladebit}
cuda/harvesting/CudaThresherDummy.cpp
tests/TestUtil.h
tests/TestDiskQueue.cpp
)

target_compile_definitions(tests PRIVATE
BB_TEST_MODE=1
)
target_link_libraries(tests PRIVATE bladebit_config Catch2::Catch2WithMain)
target_link_libraries(tests PRIVATE bladebit_config bladebit_core Catch2::Catch2WithMain)

set_target_properties(tests PROPERTIES
EXCLUDE_FROM_ALL ON
3 changes: 2 additions & 1 deletion VERSION
@@ -1 +1,2 @@
3.0.0
3.1.0

11 changes: 11 additions & 0 deletions build-cuda.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -e
_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
cd $_dir

build_dir=build-release
mkdir -p ${build_dir}
cd ${build_dir}

cmake .. -DCMAKE_BUILD_TYPE=Release
cmake --build . --target bladebit_cuda --config Release --clean-first -j24
27 changes: 18 additions & 9 deletions cmake_modules/EmbedVersion.cmake
@@ -2,18 +2,25 @@
if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded}))
message("Embedding local build version")

set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")

set(cmd_ver bash)
set(cmd_shell bash)
set(cmd_ext sh)
if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
set(cmd_ver bash.exe)

find_program(bash_path NAMES bash.exe NO_CACHE)

if(${bash_path} MATCHES "-NOTFOUND")
set(cmd_shell powershell)
set(cmd_ext ps1)
else()
set(cmd_shell "${bash_path}")
endif()
endif()

execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} major OUTPUT_VARIABLE bb_ver_maj WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} minor OUTPUT_VARIABLE bb_ver_min WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} revision OUTPUT_VARIABLE bb_ver_rev WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} suffix OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} commit OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)

# Remove trailing whitespace incurred in Windows Git Bash
string(STRIP "${bb_ver_maj}" bb_ver_maj)
@@ -39,3 +46,5 @@ if(NOT DEFINED ENV{CI})
add_compile_definitions(BLADEBIT_VERSION_SUFFIX="${bb_ver_suffix}")
add_compile_definitions(BLADEBIT_GIT_COMMIT="${bb_ver_commit}")
endif()

set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
11 changes: 6 additions & 5 deletions cuda/CudaPlotConfig.h
@@ -19,7 +19,7 @@
#define BBCU_TABLE_ENTRY_COUNT (1ull<<32)
#define BBCU_BUCKET_ENTRY_COUNT (BBCU_TABLE_ENTRY_COUNT/BBCU_BUCKET_COUNT)
//#define BBCU_XTRA_ENTRIES_PER_SLICE (1024u*64u)
#define BBCU_XTRA_ENTRIES_PER_SLICE (4096u*1u)
#define BBCU_XTRA_ENTRIES_PER_SLICE (4096+1024)
#define BBCU_MAX_SLICE_ENTRY_COUNT ((BBCU_BUCKET_ENTRY_COUNT/BBCU_BUCKET_COUNT)+BBCU_XTRA_ENTRIES_PER_SLICE)
#define BBCU_BUCKET_ALLOC_ENTRY_COUNT (BBCU_MAX_SLICE_ENTRY_COUNT*BBCU_BUCKET_COUNT)
#define BBCU_TABLE_ALLOC_ENTRY_COUNT (((uint64)BBCU_BUCKET_ALLOC_ENTRY_COUNT)*BBCU_BUCKET_COUNT)
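
This is the slice-overflow fix called out in the commit notes: per-slice head-room grows from 4096 to 5120 entries. A minimal sketch of the sizing arithmetic, assuming `BBCU_BUCKET_COUNT` is 128 (its value is not shown in this diff):

```cpp
#include <cstdint>

constexpr uint64_t kBucketCount   = 128;                          // assumed BBCU_BUCKET_COUNT
constexpr uint64_t kTableEntries  = 1ull << 32;                   // BBCU_TABLE_ENTRY_COUNT
constexpr uint64_t kBucketEntries = kTableEntries / kBucketCount; // BBCU_BUCKET_ENTRY_COUNT

// A slice is a bucket's share of a bucket, plus slack for uneven distribution.
constexpr uint64_t kMaxSliceOld = kBucketEntries / kBucketCount + 4096;        // 266,240 entries
constexpr uint64_t kMaxSliceNew = kBucketEntries / kBucketCount + 4096 + 1024; // 267,264 entries

static_assert( kMaxSliceNew - kMaxSliceOld == 1024, "the fix adds 1024 entries of slack per slice" );
```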
@@ -42,12 +42,12 @@ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLI
#ifdef _WIN32
#define DBG_BBCU_DBG_DIR "D:/dbg/cuda/"
#else
// #define DBG_BBCU_DBG_DIR "/home/harold/plot/dbg/cuda/"
#define DBG_BBCU_DBG_DIR "/home/harito/plot/dbg/cuda/"
#define DBG_BBCU_DBG_DIR "/home/harold/plotdisk/dbg/cuda/"
// #define DBG_BBCU_DBG_DIR "/home/harito/plots/dbg/cuda/"
#endif
// #define DBG_BBCU_REF_DIR "/home/harold/plot/ref/"
// #define DBG_BBCU_REF_DIR "/home/harold/plots/ref/"



// #define BBCU_DBG_SKIP_PHASE_1 1 // Skip phase 1 and load pairs from disk
// #define BBCU_DBG_SKIP_PHASE_2 1 // Skip phase 1 and 2 and load pairs and marks from disk

@@ -60,6 +60,7 @@ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLI
// #define DBG_BBCU_P2_WRITE_MARKS 1

// #define DBG_BBCU_P2_COUNT_PRUNED_ENTRIES 1
// #define DBG_BBCU_KEEP_TEMP_FILES 1


#define _ASSERT_DOES_NOT_OVERLAP( b0, b1, size ) ASSERT( (b1+size) <= b0 || b1 >= (b0+size) )
82 changes: 67 additions & 15 deletions cuda/CudaPlotContext.h
@@ -7,11 +7,16 @@
#include "plotting/PlotTypes.h"
#include "plotting/PlotWriter.h"
#include "GpuStreams.h"
#include "GpuQueue.h"
#include "util/StackAllocator.h"
#include "fse/fse.h"
#include "threading/Fence.h"
#include "plotting/GlobalPlotConfig.h"
#include "threading/ThreadPool.h"
#include "plotting/BufferChain.h"
#include "plotting/DiskBuffer.h"
#include "plotting/DiskBucketBuffer.h"
#include <filesystem>

#include "cub/device/device_radix_sort.cuh"
// #include <cub/device/device_radix_sort.cuh>
@@ -29,7 +34,51 @@ using namespace cooperative_groups;
#endif


struct CudaK32ParkContext
{
Span<byte> table7Memory; // Memory buffer reserved for finalizing table7 and writing C parks
BufferChain* parkBufferChain;
uint32 maxParkBuffers; // Maximum number of park buffers
uint64* hostRetainedLinePoints;
};

struct CudaK32HybridMode
{
// For clarity, these are the file names for the disk buffers
// whose disk space is shared for temp data between phases 1 and 3.
// The names indicate their usage and phase.
// The name indicates their usage and in which phase.
static constexpr std::string_view Y_DISK_BUFFER_FILE_NAME = "p1y-p3index.tmp";
static constexpr std::string_view META_DISK_BUFFER_FILE_NAME = "p1meta-p3rmap.tmp";
static constexpr std::string_view LPAIRS_DISK_BUFFER_FILE_NAME = "p1unsortedx-p1lpairs-p3lp-p3-lmap.tmp";

static constexpr std::string_view P3_RMAP_DISK_BUFFER_FILE_NAME = META_DISK_BUFFER_FILE_NAME;
static constexpr std::string_view P3_INDEX_DISK_BUFFER_FILE_NAME = Y_DISK_BUFFER_FILE_NAME;
static constexpr std::string_view P3_LP_AND_LMAP_DISK_BUFFER_FILE_NAME = LPAIRS_DISK_BUFFER_FILE_NAME;

DiskQueue* temp1Queue; // Tables Queue
DiskQueue* temp2Queue; // Metadata Queue (could be the same as temp1Queue)

DiskBucketBuffer* metaBuffer; // Enabled in < 128G mode
DiskBucketBuffer* yBuffer; // Enabled in < 128G mode
DiskBucketBuffer* unsortedL; // Unsorted Xs (or L pairs in < 128G) are written to disk (uint64 entries)
DiskBucketBuffer* unsortedR; // Unsorted R pairs in < 128G mode

DiskBuffer* tablesL[7];
DiskBuffer* tablesR[7];

GpuDownloadBuffer _tablesL[7];
GpuDownloadBuffer _tablesR[7];

struct
{
// #NOTE: These buffers share the same file-backed storage
// as other buffers in phase 1.
DiskBucketBuffer* rMapBuffer; // Step 1
DiskBucketBuffer* indexBuffer; // X-step/Step 2
DiskBucketBuffer* lpAndLMapBuffer; // X-step/Step 2 (LP) | Step 3 (LMap)

} phase3;
};

struct CudaK32Phase2
{
@@ -64,11 +113,12 @@ struct CudaK32Phase3
};

uint64 pairsLoadOffset;


// Device buffers
uint32* devBucketCounts;
uint32* devPrunedEntryCount;


// Host buffers
union {
RMap* hostRMap;
uint32* hostIndices;
@@ -79,12 +129,6 @@ struct CudaK32Phase3
uint64* hostLinePoints;
};

// #TODO: Remove this when we sort-out all of the buffer usage
// uint64* hostMarkingTables[6]; // Set by Phase 2


// uint32* hostBucketCounts;

uint32 prunedBucketCounts[7][BBCU_BUCKET_COUNT];
uint64 prunedTableEntryCounts[7];

@@ -111,9 +155,10 @@ struct CudaK32Phase3
// Step 2
struct {
GpuUploadBuffer rMapIn; // RMap from step 1
GpuUploadBuffer lMapIn; // Output map (uint64) from the previous table run. Or during L table 1, it is inlined x values
GpuUploadBuffer lMapIn; // Output map (uint64) from the previous table run. Or, when L table is the first stored table, it is inlined x values
GpuDownloadBuffer lpOut; // Output line points (uint64)
GpuDownloadBuffer indexOut; // Output source line point index (uint32) (taken from the rMap source value)
GpuDownloadBuffer parksOut; // Output P7 parks on the last table
uint32* devLTable[2]; // Unpacked L table bucket

uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT];
@@ -123,7 +168,7 @@ struct CudaK32Phase3
struct {
GpuUploadBuffer lpIn; // Line points from step 2
GpuUploadBuffer indexIn; // Indices from step 2
GpuDownloadBuffer mapOut; // lTable for next step 1
GpuDownloadBuffer mapOut; // lTable for next step 2
GpuDownloadBuffer parksOut; // Downloads park buffers to host

uint32* hostParkOverrunCount;
@@ -137,7 +182,6 @@ struct CudaK32Phase3
FSE_CTable* devCTable;
uint32* devParkOverrunCount;

Fence* parkFence;
std::atomic<uint32> parkBucket;

uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT];
@@ -178,8 +222,9 @@ struct CudaK32PlotContext
int32 cudaDevice = -1;
cudaDeviceProp* cudaDevProps = nullptr;
bool downloadDirect = false;
TableId firstStoredTable = TableId::Table2; // First non-dropped table that has back pointers
ThreadPool* threadPool = nullptr;

TableId table = TableId::Table1; // Current table being generated
uint32 bucket = 0; // Current bucket being processed

@@ -192,6 +237,7 @@ struct CudaK32PlotContext
PlotRequest plotRequest;
PlotWriter* plotWriter = nullptr;
Fence* plotFence = nullptr;
Fence* parkFence = nullptr;

// Root allocations
size_t allocAlignment = 0;
@@ -263,8 +309,6 @@ struct CudaK32PlotContext
uint32* hostBucketSlices = nullptr;
uint32* hostTableL = nullptr;
uint16* hostTableR = nullptr;
uint32* hostTableSortedL = nullptr;
uint16* hostTableSortedR = nullptr;

union {
uint32* hostMatchCount = nullptr;
@@ -279,6 +323,14 @@ struct CudaK32PlotContext
CudaK32Phase2* phase2 = nullptr;
CudaK32Phase3* phase3 = nullptr;

CudaK32HybridMode* diskContext = nullptr;
CudaK32ParkContext* parkContext = nullptr;
bool useParkContext = false;

// Used when '--check' is enabled
struct GreenReaperContext* grCheckContext = nullptr;
class PlotChecker* plotChecker = nullptr;

struct
{
Duration uploadTime = Duration::zero(); // Host-to-device wait time
@@ -359,7 +411,7 @@ inline uint32 CudaK32PlotGetOutputIndex( CudaK32PlotContext& cx )
}

//-----------------------------------------------------------
inline bool CudaK32PlotIsOutputInterleaved( CudaK32PlotContext& cx )
inline bool CudaK32PlotIsOutputVertical( CudaK32PlotContext& cx )
{
return CudaK32PlotGetOutputIndex( cx ) == 0;
}
113 changes: 69 additions & 44 deletions cuda/CudaPlotPhase2.cu
@@ -20,8 +20,7 @@
static void CudaK32PlotAllocateBuffersTest( CudaK32PlotContext& cx );

#define MARK_TABLE_BLOCK_THREADS 128
#define P2_BUCKET_COUNT BBCU_BUCKET_COUNT
#define P2_ENTRIES_PER_BUCKET BBCU_BUCKET_ALLOC_ENTRY_COUNT //((1ull<<BBCU_K)/P2_BUCKET_COUNT)
#define P2_ENTRIES_PER_BUCKET BBCU_BUCKET_ALLOC_ENTRY_COUNT //((1ull<<BBCU_K)/BBCU_BUCKET_COUNT)


inline size_t GetMarkingTableByteSize()
@@ -30,7 +29,8 @@ inline size_t GetMarkingTableByteSize()
}

template<bool useRMarks>
__global__ void CudaMarkTables( const uint32 entryCount, const uint32* lPairs, const uint16* rPairs, byte* marks, const uint64* rTableMarks, const uint32 rOffset )
__global__ void CudaMarkTables( const uint32 entryCount, const uint32* lPairs, const uint16* rPairs,
byte* marks, const uint64* rTableMarks, const uint32 rOffset )
{
const uint32 gid = blockIdx.x * blockDim.x + threadIdx.x;

@@ -39,11 +39,11 @@ __global__ void CudaMarkTables( const uint32 entryCount, const uint32* lPairs, c
return;

if constexpr ( useRMarks )
{
{
if( !CuBitFieldGet( rTableMarks, rOffset + gid ) )
return;
}

const uint32 l = lPairs[gid];
const uint32 r = l + rPairs[gid];

@@ -117,12 +117,12 @@ static void BytefieldToBitfield( CudaK32PlotContext& cx, const byte* bytefield,

ASSERT( (uint64)blockCount * blockThreadCount * 64 == tableEntryCount );

#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
#define G_PRUNED_COUNTS ,cx.phase2->devPrunedCount
CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) );
#else
#else
#define G_PRUNED_COUNTS
#endif
#endif

ASSERT_DOES_NOT_OVERLAP2( bitfield, bytefield, GetMarkingTableBitFieldSize(), GetMarkingTableByteSize() );
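
`BytefieldToBitfield` packs one marking byte per entry into a dense bitfield, one output word per thread. A host-side sketch of the same packing (assumed semantics; the actual kernel body is not shown in this hunk):

```cpp
#include <cstdint>

// Every 64 mark bytes (each 0 or 1) collapse into one 64-bit word,
// with bit b of word i set iff byte i*64 + b is set.
void PackBytefieldToBitfield( const uint8_t* bytes, uint64_t* bits, const uint64_t wordCount )
{
    for( uint64_t i = 0; i < wordCount; i++ )
    {
        uint64_t word = 0;
        for( uint32_t b = 0; b < 64; b++ )
            word |= (uint64_t)( bytes[i * 64 + b] & 1 ) << b;

        bits[i] = word;
    }
}
```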

@@ -131,8 +131,11 @@ static void BytefieldToBitfield( CudaK32PlotContext& cx, const byte* bytefield,

void LoadPairs( CudaK32PlotContext& cx, CudaK32Phase2& p2, const TableId rTable, const uint32 bucket )
{
if( bucket >= BBCU_BUCKET_COUNT )
return;

const uint64 tableEntryCount = cx.tableEntryCounts[(int)rTable];
const uint32 entryCount = BBCU_BUCKET_ENTRY_COUNT;//(uint32)std::min( (uint64)BBCU_BUCKET_ENTRY_COUNT, tableEntryCount - p2.pairsLoadOffset );// cx.bucketCounts[(int)rTable][bucket];
const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];

// uint32* hostPairsL = cx.hostTableSortedL + p2.pairsLoadOffset;
// uint16* hostPairsR = cx.hostTableSortedR + p2.pairsLoadOffset;
@@ -163,42 +166,48 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )

byte* devLMarks = p2.devMarkingTable;

if( cx.cfg.hybrid128Mode )
{
cx.diskContext->tablesL[(int)rTable]->Swap();
cx.diskContext->tablesR[(int)rTable]->Swap();

p2.pairsLIn.AssignDiskBuffer( cx.diskContext->tablesL[(int)rTable] );
p2.pairsRIn.AssignDiskBuffer( cx.diskContext->tablesR[(int)rTable] );
}

// Zero-out marks
CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) );

// Load first bucket's worth of pairs
LoadPairs( cx, p2, rTable, 0 );

uint32 rOffset = 0;
for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ )
{
const bool isLastBucket = bucket + 1 == P2_BUCKET_COUNT;
// Mark the table, bucket by bucket
uint32 rTableGlobalIndexOffset = 0;

// Load next set of pairs in the background
if( !isLastBucket )
LoadPairs( cx, p2, rTable, bucket + 1 );
for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
// Load next set of pairs in the background (if there is another bucket)
LoadPairs( cx, p2, rTable, bucket + 1 );

const uint64 tableEntryCount = cx.tableEntryCounts[(int)rTable];
const uint32 entryCount = isLastBucket ? tableEntryCount - (BBCU_BUCKET_ENTRY_COUNT * (BBCU_BUCKET_COUNT-1)): BBCU_BUCKET_ENTRY_COUNT;
// const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];
const uint32 entryCount = cx.bucketCounts[(int)rTable][bucket];

// Wait for pairs to be ready
const uint32* devLPairs = p2.pairsLIn.GetUploadedDeviceBufferT<uint32>( cx.computeStream );
const uint16* devRPairs = p2.pairsRIn.GetUploadedDeviceBufferT<uint16>( cx.computeStream );


// Mark
const uint32 blockCount = (uint32)CDiv( entryCount, MARK_TABLE_BLOCK_THREADS );

if( rTable == TableId::Table7 )
CudaMarkTables<false><<<blockCount, MARK_TABLE_BLOCK_THREADS, 0, cx.computeStream>>>( entryCount, devLPairs, devRPairs, devLMarks, nullptr, 0 );
else
CudaMarkTables<true ><<<blockCount, MARK_TABLE_BLOCK_THREADS, 0, cx.computeStream>>>( entryCount, devLPairs, devRPairs, devLMarks, p2.devRMarks[(int)rTable], rOffset );
CudaMarkTables<true ><<<blockCount, MARK_TABLE_BLOCK_THREADS, 0, cx.computeStream>>>( entryCount, devLPairs, devRPairs, devLMarks, p2.devRMarks[(int)rTable], rTableGlobalIndexOffset );

p2.pairsLIn.ReleaseDeviceBuffer( cx.computeStream );
p2.pairsRIn.ReleaseDeviceBuffer( cx.computeStream );

rOffset += entryCount;
rTableGlobalIndexOffset += entryCount;
}

// Convert the bytefield marking table to a bitfield
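
The refactored loop above is a double-buffered pipeline: while bucket `b` is being marked on the GPU, bucket `b + 1` is uploaded in the background, and the out-of-range prefetch on the last iteration is now absorbed by the new early-return guard in `LoadPairs` rather than an `isLastBucket` special case. Schematically (a sketch with illustrative names, not the actual code):

```cpp
#include <cstdint>

constexpr uint32_t kBucketCount = 128; // assumed BBCU_BUCKET_COUNT

void LoadBucketAsync( const uint32_t bucket )
{
    if( bucket >= kBucketCount )
        return; // absorbs the prefetch past the last bucket
    // ... enqueue the async host-to-device upload of this bucket's pairs ...
}

void MarkAllBuckets()
{
    LoadBucketAsync( 0 ); // prime the pipeline with the first bucket

    for( uint32_t bucket = 0; bucket < kBucketCount; bucket++ )
    {
        LoadBucketAsync( bucket + 1 ); // prefetch; no-op past the last bucket
        // ... wait for this bucket's pairs on the device, launch the marking
        //     kernel, then release the upload buffers for reuse ...
    }
}
```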
@@ -209,14 +218,14 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
// Download bitfield marks
// uint64* hostBitField = p2.hostBitFieldAllocator->AllocT<uint64>( GetMarkingTableBitFieldSize() );
uint64* hostBitField = cx.hostMarkingTables[(int)lTable];

// #TODO: Do download and copy again, for now just store all of them in this pinned buffer
// cx.phase3->hostMarkingTables[(int)lTable] = hostBitField;
p2.outMarks.Download( hostBitField, GetMarkingTableBitFieldSize(), cx.computeStream );

// p2.outMarks.DownloadAndCopy( hostBitField, cx.hostMarkingTables[(int)lTable], GetMarkingTableBitFieldSize(), cx.computeStream );
// p2.outMarks.Download( cx.hostMarkingTables[(int)lTable], GetMarkingTableBitFieldSize() );


#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
{
@@ -370,6 +379,9 @@ void CudaK32PlotPhase2( CudaK32PlotContext& cx )
MarkTable( cx, p2 );
p2.outMarks.WaitForCompletion();
p2.outMarks.Reset();
p2.pairsLIn.Reset();
p2.pairsRIn.Reset();

const auto elapsed = TimerEnd( timer );
Log::Line( "Marked Table %u in %.2lf seconds.", rTable, elapsed );

@@ -380,7 +392,7 @@ void CudaK32PlotPhase2( CudaK32PlotContext& cx )
}

// Wait for everything to complete

// p2.outMarks.WaitForCopyCompletion(); // #TODO: Re-activate this when re-enabling copy
p2.outMarks.WaitForCompletion();
p2.outMarks.Reset();
@@ -392,30 +404,39 @@ void CudaK32PlotPhase2( CudaK32PlotContext& cx )
///
void CudaK32PlotPhase2AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
{
const size_t alignment = cx.allocAlignment;
GpuStreamDescriptor desc{};

desc.entriesPerSlice = P2_ENTRIES_PER_BUCKET;
desc.sliceCount = 1;
desc.sliceAlignment = cx.allocAlignment;
desc.bufferCount = BBCU_DEFAULT_GPU_BUFFER_COUNT;
desc.deviceAllocator = acx.devAllocator;
desc.pinnedAllocator = nullptr; // Start in direct mode (no intermediate pinned buffers)

if( cx.cfg.hybrid128Mode )
{
desc.pinnedAllocator = acx.pinnedAllocator;
desc.sliceAlignment = cx.diskContext->temp1Queue->BlockSize();
}

IAllocator& devAllocator = *acx.devAllocator;
IAllocator& pinnedAllocator = *acx.pinnedAllocator;
if( !cx.downloadDirect )
desc.pinnedAllocator = acx.pinnedAllocator;

CudaK32Phase2& p2 = *cx.phase2;

const size_t markingTableByteSize = GetMarkingTableByteSize();
const size_t markingTableBitFieldSize = GetMarkingTableBitFieldSize();

p2.devPrunedCount = devAllocator.CAlloc<uint32>( 1, alignment );
p2.devMarkingTable = devAllocator.AllocT<byte>( markingTableByteSize, alignment );

p2.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
sizeof( uint32 ) * P2_ENTRIES_PER_BUCKET, devAllocator, pinnedAllocator, alignment, acx.dryRun );
// Device buffers
p2.devPrunedCount = acx.devAllocator->CAlloc<uint32>( 1, acx.alignment );
p2.devMarkingTable = acx.devAllocator->AllocT<byte>( markingTableByteSize, acx.alignment );

p2.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBuffer(
sizeof( uint16 ) * P2_ENTRIES_PER_BUCKET, devAllocator, pinnedAllocator, alignment, acx.dryRun );
// Upload/Download streams
p2.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT<uint32>( desc, acx.dryRun );
p2.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT<uint16>( desc, acx.dryRun );

p2.outMarks = cx.gpuDownloadStream[0]->CreateDirectDownloadBuffer(
markingTableBitFieldSize, devAllocator, alignment, acx.dryRun );

// These buffers are safe to use at this point
// p2.hostBitFieldAllocator = new StackAllocator( cx.hostTableR, sizeof( uint32 ) * BBCU_TABLE_ALLOC_ENTRY_COUNT );
desc.entriesPerSlice = markingTableBitFieldSize;
p2.outMarks = cx.gpuDownloadStream[0]->CreateDownloadBufferT<byte>( desc, acx.dryRun );
}


@@ -550,7 +571,7 @@ void DbgValidateTable( CudaK32PlotContext& cx )
{
{
uint64 totalCount = 0;
for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ )
for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
totalCount += cx.bucketCounts[(int)rt][bucket];

ASSERT( totalCount == cx.tableEntryCounts[(int)rt] );
@@ -562,7 +583,7 @@ void DbgValidateTable( CudaK32PlotContext& cx )

Pairs hostRTablePairs = cx.hostBackPointers[(int)rt];

for( uint32 bucket = 0; bucket < P2_BUCKET_COUNT; bucket++ )
for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
const uint32 rTableBucketEntryCount = cx.bucketCounts[(int)rt][bucket];

@@ -638,9 +659,13 @@ void DbgWriteMarks( CudaK32PlotContext& cx, const TableId table )
{
char path[512];

std::string baseUrl = DBG_BBCU_DBG_DIR;
if( cx.cfg.hybrid128Mode )
baseUrl += "disk/";

Log::Line( "[DEBUG] Writing marking table %u to disk...", table+1 );
{
sprintf( path, "%smarks%d.tmp", DBG_BBCU_DBG_DIR, (int)table+1 );
sprintf( path, "%smarks%d.tmp", baseUrl.c_str(), (int)table+1 );

const uint64* marks = cx.hostMarkingTables[(int)table];

375 changes: 285 additions & 90 deletions cuda/CudaPlotPhase3.cu

Large diffs are not rendered by default.

29 changes: 14 additions & 15 deletions cuda/CudaPlotPhase3Internal.h
@@ -10,8 +10,18 @@
#include "plotdisk/jobs/IOJob.h"
#include "algorithm/RadixSort.h"
#include "plotmem/ParkWriter.h"
#include "b3/blake3.h"

void DbgValidateStep2Output( CudaK32PlotContext& cx );

void DbgHashData( const void* data, size_t size, const char* name, uint32 index );

void DbgFinishAndPrintHash( blake3_hasher& hasher, const char* name, uint32 index );
template<typename T>
inline void DbgHashDataT( const T* data, uint64 count, const char* name, uint32 index )
{
DbgHashData( data, (size_t)count * sizeof( T ), name, index );
}
#endif

using LMap = CudaK32Phase3::LMap;
@@ -27,22 +37,11 @@ static_assert( alignof( LMap ) == sizeof( uint32 ) );
#define P3_PRUNED_TABLE_MAX_ENTRIES BBCU_TABLE_ALLOC_ENTRY_COUNT //(P3_PRUNED_BUCKET_MAX*BBCU_BUCKET_COUNT)
#define P3_PRUNED_MAX_PARKS_PER_BUCKET ((P3_PRUNED_BUCKET_MAX/kEntriesPerPark)+2)

static constexpr size_t P3_MAX_CTABLE_SIZE = 38u * 1024u; // Should be more than enough

//static constexpr size_t P3_LP_BUCKET_COUNT = BBCU_BUCKET_COUNT;// << 1;
//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = BBCU_MAX_SLICE_ENTRY_COUNT;
//static constexpr uint32 P3_LP_BUCKET_BITS = BBC_BUCKET_BITS;

// static constexpr uint32 P3_LP_BUCKET_BITS = (uint32)(CuBBLog2( P3_LP_BUCKET_COUNT ));
//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ),
//BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE );
// static constexpr size_t P3_LP_BUCKET_ENTRY_COUNT = P3_LP_SLICE_ENTRY_COUNT * P3_LP_BUCKET_COUNT;

//static constexpr size_t P3_LP_BUCKET_STRIDE = BBCU_BUCKET_ALLOC_ENTRY_COUNT;

// static constexpr size_t P3_LP_BUCKET_ALLOC_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ),
// BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE );
// //static constexpr size_t P3_LP_TABLE_ALLOC_COUNT = P3_LP_BUCKET_STRIDE * BBCU_BUCKET_COUNT;
static constexpr size_t P3_MAX_CTABLE_SIZE = 38u * 1024u; // Should be more than enough
static constexpr size_t P3_MAX_P7_PARKS_PER_BUCKET = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2;
static constexpr size_t P3_PARK_7_SIZE = CalculatePark7Size( BBCU_K );
static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= P3_MAX_P7_PARKS_PER_BUCKET * P3_PARK_7_SIZE );

static constexpr size_t MAX_PARK_SIZE = CalculateParkSize( TableId::Table1 );
static constexpr size_t DEV_MAX_PARK_SIZE = CuCDiv( MAX_PARK_SIZE, sizeof( uint64 ) ) * sizeof( uint64 ); // Align parks to 64 bits, for easier writing of stubs
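
For reference, the arithmetic behind `P3_PARK_7_SIZE`, assuming the chiapos defaults of `kEntriesPerPark = 2048` and `k + 1`-bit park-7 entries (both assumed; neither is shown in this diff):

```cpp
#include <cstddef>

constexpr size_t CDivS( size_t a, size_t b ) { return ( a + b - 1 ) / b; }

constexpr size_t kEntriesPerPark = 2048; // assumed chiapos default
constexpr size_t kK              = 32;   // BBCU_K

// Park 7 stores (k+1)-bit entries: 2048 * 33 bits = 8448 bytes per park.
constexpr size_t kPark7Size = CDivS( kEntriesPerPark * ( kK + 1 ), 8 );
static_assert( kPark7Size == 8448 );
```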
85 changes: 68 additions & 17 deletions cuda/CudaPlotPhase3Step2.cu
@@ -248,7 +248,7 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )

s2.rMapIn.UploadArrayT<RMap>( rmap, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, rSliceCounts );
};


const TableId rTable = cx.table;
const TableId lTable = rTable-1;
@@ -309,15 +309,14 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
const auto* rMap = (RMap*)s2.rMapIn.GetUploadedDeviceBuffer( cx.computeStream );
const uint32 rEntryCount = p3.prunedBucketCounts[(int)rTable][bucket];


uint64* devOutLPs = (uint64*)s2.lpOut .LockDeviceBuffer( cx.computeStream );
uint32* devOutIndices = (uint32*)s2.indexOut.LockDeviceBuffer( cx.computeStream );

ConvertRMapToLinePoints( cx, rEntryCount, rTableOffset, devLTable, rMap, devOutLPs, devOutIndices, cx.computeStream );
s2.rMapIn.ReleaseDeviceBuffer( cx.computeStream );
rTableOffset += rEntryCount;


// Horizontal download (write 1 row)
s2.lpOut .Download2DT<uint64>( p3.hostLinePoints + (size_t)bucket * P3_PRUNED_BUCKET_MAX , P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX , P3_PRUNED_SLICE_MAX, cx.computeStream );
s2.indexOut.Download2DT<uint32>( p3.hostIndices + (size_t)bucket * P3_PRUNED_BUCKET_MAX*3, P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_SLICE_MAX*3, P3_PRUNED_SLICE_MAX, cx.computeStream );
@@ -354,7 +353,7 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )

CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
cudaMemcpyDeviceToHost, downloadStream ) );

memset( p3.prunedBucketCounts[(int)rTable], 0, BBCU_BUCKET_COUNT * sizeof( uint32 ) );

CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
@@ -370,8 +369,15 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
ASSERT( p3.prunedBucketCounts[(int)rTable][bucket] <= P3_PRUNED_BUCKET_MAX );
}

if( cx.cfg.hybrid16Mode )
{
cx.diskContext->phase3.rMapBuffer->Swap();
cx.diskContext->phase3.lpAndLMapBuffer->Swap();
cx.diskContext->phase3.indexBuffer->Swap();
}

// #if _DEBUG
// if( cx.table > TableId::Table3 )
// // if( cx.table > TableId::Table3 )
// {
// DbgValidateStep2Output( cx );
// }
@@ -402,23 +408,26 @@ void WritePark7( CudaK32PlotContext& cx )
auto& p3 = *cx.phase3;
auto& s2 = p3.step2;


// Load initial bucket
LoadBucket( cx, 0 );

// Begin park 7 table in plot
cx.plotWriter->BeginTable( PlotTable::Table7 );

constexpr size_t parkSize = CalculatePark7Size( BBCU_K );
constexpr size_t parkSize = P3_PARK_7_SIZE;
constexpr size_t parkFieldCount = parkSize / sizeof( uint64 );
static_assert( parkFieldCount * sizeof( uint64 ) == parkSize );

GpuDownloadBuffer& parkDownloader = cx.useParkContext ? s2.parksOut : s2.lpOut;

GpuDownloadBuffer& parkDownloader = s2.lpOut;

constexpr size_t maxParksPerBucket = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2;
constexpr size_t maxParksPerBucket = P3_MAX_P7_PARKS_PER_BUCKET;
static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= maxParksPerBucket * parkSize );

if( cx.useParkContext )
{
cx.parkContext->parkBufferChain->Reset();
}

// Host stuff
constexpr size_t hostMetaTableSize = sizeof( RMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
@@ -427,9 +436,10 @@ void WritePark7( CudaK32PlotContext& cx )
const uint64 tableEntryCount = cx.tableEntryCounts[(int)cx.table];
const size_t totalParkCount = CDiv( (size_t)tableEntryCount, kEntriesPerPark );

byte* hostParks = hostAllocator.AllocT<byte>( totalParkCount * parkSize );
byte* hostParkWriter = hostParks;
uint32* hostLastParkEntries = hostAllocator.CAlloc<uint32>( kEntriesPerPark );
byte* hostParks = cx.useParkContext ? nullptr : hostAllocator.AllocT<byte>( totalParkCount * parkSize );
byte* hostParksWriter = cx.useParkContext ? nullptr : hostParks;
uint32* hostLastParkEntries = cx.useParkContext ? (uint32*)cx.parkContext->hostRetainedLinePoints :
hostAllocator.CAlloc<uint32>( kEntriesPerPark );

static_assert( kEntriesPerPark * maxParksPerBucket <= BBCU_BUCKET_ALLOC_ENTRY_COUNT * 2 );
uint32* devIndexBuffer = s2.devLTable[0] + kEntriesPerPark;
@@ -479,14 +489,38 @@ void WritePark7( CudaK32PlotContext& cx )
// Download parks & write to plot
const size_t downloadSize = parkCount * parkSize;

parkDownloader.DownloadWithCallback( hostParkWriter, downloadSize,
if( cx.useParkContext )
{
ASSERT( downloadSize <= cx.parkContext->parkBufferChain->BufferSize() );

// Override the park buffer to be used when using a park context
hostParksWriter = cx.parkContext->parkBufferChain->PeekBuffer( bucket );

// Wait for the next park buffer to be available
parkDownloader.HostCallback([&cx]{
(void)cx.parkContext->parkBufferChain->GetNextBuffer();
});
}

parkDownloader.DownloadWithCallback( hostParksWriter, downloadSize,
[]( void* parksBuffer, size_t size, void* userData ) {

auto& cx = *reinterpret_cast<CudaK32PlotContext*>( userData );
cx.plotWriter->WriteTableData( parksBuffer, size );

// Release the buffer after the plot writer is done with it.
if( cx.useParkContext )
{
cx.plotWriter->CallBack([&cx](){
cx.parkContext->parkBufferChain->ReleaseNextBuffer();
});
}

}, &cx, cx.computeStream );

hostParkWriter += downloadSize;
hostParksWriter += downloadSize;
if( cx.useParkContext )
hostParksWriter = nullptr;
}

// Wait for parks to complete downloading
@@ -499,9 +533,19 @@ void WritePark7( CudaK32PlotContext& cx )
// Was there a left-over park?
if( retainedEntryCount > 0 )
{
if( cx.useParkContext )
hostParksWriter = cx.parkContext->parkBufferChain->GetNextBuffer();

// Submit last park to plot
TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParkWriter );
cx.plotWriter->WriteTableData( hostParkWriter, parkSize );
TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParksWriter );
cx.plotWriter->WriteTableData( hostParksWriter, parkSize );

if( cx.useParkContext )
{
cx.plotWriter->CallBack([&cx](){
cx.parkContext->parkBufferChain->ReleaseNextBuffer();
});
}
}
cx.plotWriter->EndTable();
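
This park writer (and the step 3 writer further down) follows the same buffer-chain discipline: `PeekBuffer` addresses a buffer by bucket without consuming it, `GetNextBuffer` blocks until the next buffer in the ring is free, and `ReleaseNextBuffer` returns the oldest one once the plot writer has flushed it. A minimal sketch of such a chain, assuming (not shown in this diff) that it is a fixed ring of reusable host buffers:

```cpp
#include <condition_variable>
#include <cstddef>
#include <mutex>
#include <vector>

// Illustrative stand-in for plotting/BufferChain.h (assumed semantics).
class BufferChainSketch
{
public:
    BufferChainSketch( const size_t bufferCount, const size_t bufferSize )
        : _buffers( bufferCount, std::vector<std::byte>( bufferSize ) )
        , _free   ( bufferCount )
    {}

    size_t BufferSize() const { return _buffers[0].size(); }

    // Address a buffer by bucket without consuming it.
    std::byte* PeekBuffer( const size_t bucket ) { return _buffers[bucket % _buffers.size()].data(); }

    // Block until the next buffer in the ring is free, then consume it.
    std::byte* GetNextBuffer()
    {
        std::unique_lock lock( _mutex );
        _cv.wait( lock, [this]{ return _free > 0; } );
        _free--;
        return _buffers[_next++ % _buffers.size()].data();
    }

    // Return the oldest in-flight buffer to the ring (after its write completed).
    void ReleaseNextBuffer()
    {
        { std::lock_guard lock( _mutex ); _free++; }
        _cv.notify_one();
    }

    void Reset()
    {
        std::lock_guard lock( _mutex );
        _next = 0;
        _free = _buffers.size();
    }

private:
    std::vector<std::vector<std::byte>> _buffers;
    std::mutex                          _mutex;
    std::condition_variable             _cv;
    size_t                              _next = 0;
    size_t                              _free;
};
```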

@@ -534,6 +578,7 @@ void _DbgValidateOutput( CudaK32PlotContext& cx )
auto& s2 = p3.step2;

// Validate line points...
Log::Debug( "[DEBUG] Validating line points..." );
uint64* refLinePoints = bbcvirtallocboundednuma<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
uint64* tmpLinePoints = bbcvirtallocboundednuma<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
uint32* indices = bbcvirtallocboundednuma<uint32>( BBCU_TABLE_ALLOC_ENTRY_COUNT );
@@ -614,9 +659,13 @@ void _DbgValidateOutput( CudaK32PlotContext& cx )
}
}

DbgHashDataT( refLinePoints, prunedEntryCount, "line_points", (uint32)cx.table+1 );

bbvirtfreebounded( refLinePoints );
bbvirtfreebounded( tmpLinePoints );
bbvirtfreebounded( indices );

Log::Debug( "[DEBUG] Line point validation OK" );
}

#endif
@@ -659,6 +708,8 @@ void DbgDumpSortedLinePoints( CudaK32PlotContext& cx )
ThreadPool& pool = *cx.threadPool; //DbgGetThreadPool( cx );
RadixSort256::Sort<BB_MAX_JOBS>( pool, sortedLinePoints, tmpLinePoints, prunedEntryCount );

// DbgHashDataT( sortedLinePoints, prunedEntryCount, "sorted_line_points", (uint32)cx.table+1 );

// Write to disk
{
char filePath[1024] = {};
113 changes: 89 additions & 24 deletions cuda/CudaPlotPhase3Step3.cu
@@ -52,12 +52,14 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )

// Load CTable
const bool isCompressed = cx.gCfg->compressionLevel > 0 && lTable <= (TableId)cx.gCfg->numDroppedTables;
const uint32 stubBitSize = !isCompressed ? (BBCU_K - kStubMinusBits) : cx.gCfg->compressionInfo.subtSizeBits;
const uint32 stubBitSize = !isCompressed ? (BBCU_K - kStubMinusBits) : cx.gCfg->compressionInfo.stubSizeBits;
const TableId firstTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;


const bool isFirstSerializedTable = firstTable == rTable;

const size_t cTableSize = !isCompressed ? sizeof( CTable_0 ) : cx.gCfg->cTableSize; ASSERT( cTableSize <= P3_MAX_CTABLE_SIZE );
const FSE_CTable* hostCTable = !isCompressed ? CTables[(int)lTable] : cx.gCfg->ctable;

// (must be uploaded before the first bucket, on the same stream)
CudaErrCheck( cudaMemcpyAsync( s3.devCTable, hostCTable, cTableSize, cudaMemcpyHostToDevice,
s3.lpIn.GetQueue()->GetStream() ) );
@@ -75,13 +77,32 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
const size_t hostParkSize = isCompressed ? cx.gCfg->compressionInfo.tableParkSize : CalculateParkSize( lTable );
ASSERT( DEV_MAX_PARK_SIZE >= hostParkSize );

// #TODO: Move this allocation to the beginning
if( s3.parkFence == nullptr )
s3.parkFence = new Fence();

byte* hostParksWriter = (byte*)cx.hostBackPointers[(int)rTable].left; //(byte*)cx.hostTableL;
uint64* hostRetainedEntries = nullptr;

if( cx.cfg.hybrid128Mode )
{
hostParksWriter = (byte*)cx.hostTableL;

if( !isFirstSerializedTable && !cx.useParkContext )
{
// Ensure this buffer is no longer in use (the last table has finished writing to disk).
const bool willWaitForParkFence = cx.parkFence->Value() < BBCU_BUCKET_COUNT;
if( willWaitForParkFence )
Log::Line( " Waiting for parks buffer to become available." );

Duration parkWaitTime;
cx.parkFence->Wait( BBCU_BUCKET_COUNT, parkWaitTime );

if( willWaitForParkFence )
Log::Line( " Waited %.3lf seconds for the park buffer to be released.", TicksToSeconds( parkWaitTime ) );
}
}
if( cx.useParkContext )
{
cx.parkContext->parkBufferChain->Reset();
}

// if( !isCompressed && lTable == TableId::Table1 )
// hostParksWriter = (byte*)cx.hostBackPointers[(int)TableId::Table2].left;
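
The wait above is a counting-fence handshake: the download callback further down signals cx.parkFence once per park bucket, so a value of BBCU_BUCKET_COUNT means every park of the previous table has left this host buffer. A minimal sketch of the assumed Fence semantics (Signal( n ) raises the counter, Wait( n ) blocks until it is reached):

// Hedged sketch of the counting fence used above (assumed semantics only;
// the project's Fence also has a Wait overload that reports elapsed time).
#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <mutex>

class FenceSketch
{
public:
    uint32_t Value() const { return _value.load( std::memory_order_acquire ); }

    void Signal( uint32_t value )
    {
        { std::lock_guard<std::mutex> lock( _mutex ); _value.store( value ); }
        _cv.notify_all();
    }

    void Wait( uint32_t value )
    {
        std::unique_lock<std::mutex> lock( _mutex );
        _cv.wait( lock, [&]{ return _value.load() >= value; } );
    }

    void Reset( uint32_t value ) { _value.store( value ); }

private:
    std::atomic<uint32_t>   _value{ 0 };
    std::mutex              _mutex;
    std::condition_variable _cv;
};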

@@ -101,7 +122,7 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
// Set initial event LP stream event as set.
CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) );

s3.parkFence->Reset( 0 );
cx.parkFence->Reset( 0 );
s3.parkBucket = 0;

for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
@@ -200,7 +221,8 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
// No more buckets so we have to compress this last park on the CPU
CudaErrCheck( cudaStreamWaitEvent( downloadStream, cx.computeEventC ) );

hostRetainedEntries = (uint64*)( hostParksWriter + hostParkSize * parkCount );
hostRetainedEntries = cx.useParkContext ? cx.parkContext->hostRetainedLinePoints :
(uint64*)( hostParksWriter + hostParkSize * parkCount );
CudaErrCheck( cudaMemcpyAsync( hostRetainedEntries, copySource, copySize, cudaMemcpyDeviceToHost, downloadStream ) );
}
}
@@ -209,18 +231,42 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )


// Download parks
if( cx.useParkContext )
{
ASSERT( hostParkSize * parkCount <= cx.parkContext->parkBufferChain->BufferSize() );

// Override the park buffer to be used when using a park context
hostParksWriter = cx.parkContext->parkBufferChain->PeekBuffer( bucket );

// Wait for the next park buffer to be available
s3.parksOut.HostCallback([&cx]{
(void)cx.parkContext->parkBufferChain->GetNextBuffer();
});
}

s3.parksOut.Download2DWithCallback( hostParksWriter, hostParkSize, parkCount, hostParkSize, DEV_MAX_PARK_SIZE,
[]( void* parksBuffer, size_t size, void* userData ) {

auto& cx = *reinterpret_cast<CudaK32PlotContext*>( userData );
auto& s3 = cx.phase3->step3;

cx.plotWriter->WriteTableData( parksBuffer, size );
cx.plotWriter->SignalFence( *s3.parkFence, ++s3.parkBucket );
cx.plotWriter->SignalFence( *cx.parkFence, ++s3.parkBucket );

// Release the buffer after the plot writer is done with it.
if( cx.useParkContext )
{
cx.plotWriter->CallBack([&cx](){
cx.parkContext->parkBufferChain->ReleaseNextBuffer();
});
}

}, &cx, lpStream, cx.downloadDirect );

hostParksWriter += hostParkSize * parkCount;

if( cx.useParkContext )
hostParksWriter = nullptr;
}

// Copy park overrun count
@@ -242,18 +288,24 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
// Was there a left-over park?
if( retainedLPCount > 0 )
{
ASSERT( hostRetainedEntries );

if( cx.useParkContext )
hostParksWriter = cx.parkContext->parkBufferChain->GetNextBuffer();

uint64 lastParkEntries[kEntriesPerPark];
bbmemcpy_t( lastParkEntries, hostRetainedEntries, retainedLPCount );

WritePark( hostParkSize, retainedLPCount, lastParkEntries, hostParksWriter, stubBitSize, hostCTable );
cx.plotWriter->WriteTableData( hostParksWriter, hostParkSize );

if( cx.useParkContext )
{
cx.plotWriter->CallBack([&cx](){
cx.parkContext->parkBufferChain->ReleaseNextBuffer();
});
}
}
cx.plotWriter->EndTable();

// Update buckets counts for L table
// #TODO: These should match Step 1 pruned entry count I believe, so just copy?

memset( p3.prunedBucketCounts[(int)rTable], 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT );
for( uint32 i = 0; i < BBCU_BUCKET_COUNT; i++ )
@@ -266,12 +318,19 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
s3.lpIn .Reset();
s3.indexIn.Reset();

if( cx.cfg.hybrid16Mode )
{
cx.diskContext->phase3.lpAndLMapBuffer->Swap();
cx.diskContext->phase3.indexBuffer->Swap();
}
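
In 16G hybrid mode these tables live in disk-backed double buffers, and Swap() flips which half is written by the current table and read back for the next one. The idea in miniature (assumed semantics; the real DiskBuffer performs asynchronous file I/O underneath):

// Hedged sketch: ping-pong buffer whose write half for table N becomes the
// read half for table N+1 after Swap(). Illustration only.
struct DoubleBufferSketch
{
    void* halves[2] = { nullptr, nullptr };
    int   writeIdx  = 0;

    void* WriteHalf() { return halves[writeIdx];     } // current table fills this
    void* ReadHalf()  { return halves[writeIdx ^ 1]; } // next table drains this

    void Swap() { writeIdx ^= 1; } // called once per table, after writes are submitted
};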


// #if _DEBUG
// //if( cx.table >= TableId::Table6 )
// //{
// DbgValidateLMap( cx );
// DbgValidateLMapData( cx );
// // DbgValidateLMap( cx );
// // DbgValidateLMapData( cx );

// // DbgSaveLMap( cx );
// //}
// #endif
@@ -386,7 +445,7 @@ void DbgSaveLMap( CudaK32PlotContext& cx )

char path[512];
sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.tmp", (uint)cx.table+1 );

const size_t writeSize = sizeof( LMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
int err;
FatalIf( !IOJob::WriteToFile( path, p3.hostLMap, writeSize, err ),
@@ -399,7 +458,7 @@ void DbgSaveLMap( CudaK32PlotContext& cx )
sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.buckets.tmp", (uint)cx.table+1 );
FatalIf( !IOJob::WriteToFileUnaligned( path, p3.prunedBucketCounts[(int)cx.table], sizeof( uint32 ) * BBCU_BUCKET_COUNT, err ),
"[DEBUG] Failed to write LMap buckets with error: %d", err );

Log::Line( " [DEBUG] OK" );
}

@@ -410,7 +469,7 @@ void DbgLoadLMap( CudaK32PlotContext& cx )

char path[512];
sprintf( path, DBG_BBCU_DBG_DIR "p3.lmap.t%u.tmp", (uint)cx.table+1 );

const size_t writeSize = sizeof( LMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
int err;
FatalIf( !IOJob::ReadFromFile( path, p3.hostLMap, writeSize, err ),
@@ -438,10 +497,12 @@ void DbgValidateLMap( CudaK32PlotContext& cx )
auto& p3 = *cx.phase3;
auto& s3 = p3.step3;

LMap* lMap = bbcvirtallocbounded<LMap>( BBCU_TABLE_ENTRY_COUNT );
LMap* lMap = bbcvirtallocbounded<LMap>( BBCU_BUCKET_ALLOC_ENTRY_COUNT );


{
// blake3_hasher hasher;
// blake3_hasher_init( &hasher );

for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
const LMap* reader = p3.hostLMap + bucket * P3_PRUNED_BUCKET_MAX;
@@ -471,14 +532,18 @@ void DbgValidateLMap( CudaK32PlotContext& cx )
ASSERT( map.sourceIndex || map.sortedIndex );
ASSERT( ( map.sourceIndex >> ( 32 - BBC_BUCKET_BITS ) ) == bucket );
}

// Hash bucket
// blake3_hasher_update( &hasher, lMap, sizeof( LMap ) * entryCount );
}


// Print hash
// DbgFinishAndPrintHash( hasher, "l_map", (uint)cx.table + 1 );
}

bbvirtfreebounded( lMap );

Log::Line( "[DEBUG] OK" );
Log::Line( "[DEBUG] LMap OK" );
}

//-----------------------------------------------------------
@@ -566,7 +631,7 @@ void _DbgValidateLMapData( CudaK32PlotContext& cx )
bbvirtfreebounded( dstIndices );
bbvirtfreebounded( tmpIndices );

Log::Line( "[DEBUG] OK" );
Log::Line( "[DEBUG] LMap uniqueness OK" );
}

#endif
1,010 changes: 853 additions & 157 deletions cuda/CudaPlotter.cu

Large diffs are not rendered by default.

24 changes: 19 additions & 5 deletions cuda/CudaPlotter.h
@@ -9,10 +9,22 @@ struct CudaK32PlotConfig
{
const GlobalPlotConfig* gCfg = nullptr;

uint32 deviceIndex = 0; // Which CUDA device to use when plotting//
bool disableDirectDownloads = false; // Don't allocate host tables using pinned buffers, instead
// download to intermediate pinned buffers then copy to the final host buffer.
// May be necessarry on Windows because of shared memory limitations (usual 50% of system memory)
uint32 deviceIndex = 0; // Which CUDA device to use when plotting.
bool disableDirectDownloads = false; // Don't allocate host tables using pinned buffers, instead
// download to intermediate pinned buffers then copy to the final host buffer.
// May be necessary on Windows because of shared memory limitations (usually 50% of system memory)

bool hybrid128Mode = false; // Enable hybrid disk-offload w/ 128G of RAM.
bool hybrid16Mode = false; // Enable hybrid disk-offload w/ 64G of RAM.

const char* temp1Path = nullptr; // For 128G RAM mode
const char* temp2Path = nullptr; // For 64G RAM mode

bool temp1DirectIO = true; // Use direct I/O for temp1 files
bool temp2DirectIO = true; // Use direct I/O for temp2 files

uint64 plotCheckCount = 0; // For performing plot check command after plotting
double plotCheckThreshhold = 0.6; // Proof/check threshold below which plots will be deleted
};

class CudaK32Plotter : public IPlotter
@@ -28,4 +40,6 @@ class CudaK32Plotter : public IPlotter
private:
CudaK32PlotConfig _cfg = {};
struct CudaK32PlotContext* _cx = nullptr;
};
};

void CudaK32PlotterPrintHelp();
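
Taken together, the new fields map onto the disk-hybrid and plot-check options from this release. A hedged example of filling the config for 128G hybrid mode (only the field names from the header above are real; the surrounding variables and path are placeholders):

// Hedged sketch: configuring the CUDA plotter for 128G disk-hybrid mode.
// globalCfg is a placeholder for the caller's GlobalPlotConfig.
CudaK32PlotConfig cfg = {};
cfg.gCfg                = &globalCfg;
cfg.deviceIndex         = 0;
cfg.hybrid128Mode       = true;            // offload to disk with 128G of RAM
cfg.temp1Path           = "/mnt/fast-ssd"; // hypothetical temp directory
cfg.temp1DirectIO       = true;            // direct I/O for temp1 files
cfg.plotCheckCount      = 100;             // run the plot checker afterwards
cfg.plotCheckThreshhold = 0.8;             // delete plots that score below this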
385 changes: 385 additions & 0 deletions cuda/GpuDownloadStream.cu
@@ -0,0 +1,385 @@
#include "GpuStreams.h"
#include "GpuQueue.h"
#include "plotting/DiskBucketBuffer.h"
#include "plotting/DiskBuffer.h"


///
/// DownloadBuffer
///
void* GpuDownloadBuffer::GetDeviceBuffer()
{
const uint32 index = self->outgoingSequence % self->bufferCount;

CudaErrCheck( cudaEventSynchronize( self->events[index] ) );

return self->deviceBuffer[index];
}

void* GpuDownloadBuffer::LockDeviceBuffer( cudaStream_t stream )
{
ASSERT( self->lockSequence >= self->outgoingSequence );
ASSERT( self->lockSequence - self->outgoingSequence < self->bufferCount );

const uint32 index = self->lockSequence % self->bufferCount;
self->lockSequence++;

// Wait for the device buffer to be free to be used by kernels
CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) );
return self->deviceBuffer[index];
}

void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size )
{
Download2D( hostBuffer, size, 1, size, size );
}

void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size, cudaStream_t workStream, bool directOverride )
{
Download2D( hostBuffer, size, 1, size, size, workStream, directOverride );
}

void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, const size_t size, cudaStream_t workStream )
{
Panic( "Unavailable" );
// ASSERT( self->outgoingSequence < BBCU_BUCKET_COUNT );
// ASSERT( hostBuffer );
// ASSERT( workStream );
// ASSERT( self->lockSequence > 0 );
// ASSERT( self->outgoingSequence < self->lockSequence );
// ASSERT( self->lockSequence - self->outgoingSequence <= self->bufferCount );

// auto& cpy = self->copies[self->outgoingSequence];
// cpy.self = self;
// cpy.sequence = self->outgoingSequence;
// cpy.copy.hostBuffer = finalBuffer;
// cpy.copy.srcBuffer = hostBuffer;
// cpy.copy.size = size;


// const uint32 index = self->outgoingSequence % self->bufferCount;
// self->outgoingSequence++;

// void* pinnedBuffer = self->pinnedBuffer[index];
// const void* devBuffer = self->deviceBuffer[index];

// // Signal from the work stream when it has finished doing kernel work with the device buffer
// CudaErrCheck( cudaEventRecord( self->readyEvents[index], workStream ) );


// // Ensure the work stream has completed writing data to the device buffer
// cudaStream_t stream = self->queue->_stream;

// CudaErrCheck( cudaStreamWaitEvent( stream, self->readyEvents[index] ) );

// // Copy
// CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, size, cudaMemcpyDeviceToHost, stream ) );

// // Signal that the device buffer is free to be re-used
// CudaErrCheck( cudaEventRecord( self->events[index], stream ) );

// // Launch copy command
// CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){

// const CopyInfo& c = *reinterpret_cast<CopyInfo*>( userData );
// IGpuBuffer* self = c.self;

// auto& cmd = self->queue->GetCommand( GpuQueue::CommandType::Copy );
// cmd.copy.info = &c;

// self->queue->SubmitCommands();

// // Signal the download completed
// self->fence.Signal( ++self->completedSequence );
// }, &cpy ) );
}

void GpuDownloadBuffer::DownloadWithCallback( void* hostBuffer, const size_t size, GpuDownloadCallback callback, void* userData, cudaStream_t workStream, bool directOverride )
{
Download2DWithCallback( hostBuffer, size, 1, size, size, callback, userData, workStream, directOverride );
}

void GpuDownloadBuffer::Download2D( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride, cudaStream_t workStream, bool directOverride )
{
Download2DWithCallback( hostBuffer, width, height, dstStride, srcStride, nullptr, nullptr, workStream, directOverride );
}

void GpuDownloadBuffer::Download2DWithCallback( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
GpuDownloadCallback callback, void* userData, cudaStream_t workStream, bool directOverride )
{
PerformDownload2D( hostBuffer, width, height, dstStride, srcStride,
callback, userData,
workStream, directOverride );
}

void GpuDownloadBuffer::PerformDownload2D( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
GpuDownloadCallback postCallback, void* postUserData,
cudaStream_t workStream, bool directOverride )
{
PanicIf( !(hostBuffer || self->pinnedBuffer[0] ), "" );
ASSERT( workStream );
ASSERT( self->lockSequence > 0 );
ASSERT( self->outgoingSequence < self->lockSequence );
ASSERT( self->lockSequence - self->outgoingSequence <= self->bufferCount );

const uint32 index = self->outgoingSequence++ % self->bufferCount;

void* pinnedBuffer = self->pinnedBuffer[index];
void* finalHostBuffer = hostBuffer;
const void* devBuffer = self->deviceBuffer[index];

const bool isDirect = (directOverride || self->pinnedBuffer[0] == nullptr) && !self->diskBuffer; ASSERT( isDirect || self->pinnedBuffer[0] );
const bool isSequentialCopy = dstStride == srcStride;
const size_t totalSize = height * width;


// Signal from the work stream when it has finished doing kernel work with the device buffer
CudaErrCheck( cudaEventRecord( self->workEvent[index], workStream ) );

// From the download stream, wait for the work stream to finish
cudaStream_t downloadStream = self->queue->_stream;
CudaErrCheck( cudaStreamWaitEvent( downloadStream, self->workEvent[index] ) );


if( self->diskBuffer )
{
// Wait until the next disk buffer is ready for use.
// This also signals that the pinned buffer is ready for re-use
CallHostFunctionOnStream( downloadStream, [this](){
self->diskBuffer->GetNextWriteBuffer();
});

pinnedBuffer = self->diskBuffer->PeekWriteBufferForBucket( self->outgoingSequence-1 );
}

if( !isDirect )
{
// Ensure that the pinned buffer is ready for use
// (we signal pinned buffers are ready when using disks without events)
if( !self->diskBuffer )
CudaErrCheck( cudaStreamWaitEvent( downloadStream, self->pinnedEvent[index] ) );

// Set host buffer as the pinned buffer
hostBuffer = pinnedBuffer;
}


// Copy from device to host buffer
// #NOTE: Since the pinned buffer is simply the same size (a full bucket) as the device buffer
// we also always copy as 1D if we're copying to our pinned buffer.
ASSERT( hostBuffer );
if( isSequentialCopy || hostBuffer == pinnedBuffer )
CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, totalSize, cudaMemcpyDeviceToHost, downloadStream ) );
else
CudaErrCheck( cudaMemcpy2DAsync( hostBuffer, dstStride, devBuffer, srcStride, width, height, cudaMemcpyDeviceToHost, downloadStream ) );

// Dispatch a host callback if one was set
if( postCallback )
{
CallHostFunctionOnStream( downloadStream, [=](){
(*postCallback)( finalHostBuffer, totalSize, postUserData );
});
}


// Signal that the device buffer is free to be re-used
CudaErrCheck( cudaEventRecord( self->deviceEvents[index], downloadStream ) );

if( self->diskBuffer )
{
// If it's a disk-based copy, then write the pinned buffer to disk
CallHostFunctionOnStream( downloadStream, [=]() {

auto* diskBucketBuffer = dynamic_cast<DiskBucketBuffer*>( self->diskBuffer );
if( diskBucketBuffer != nullptr )
diskBucketBuffer->Submit( srcStride );
else
static_cast<DiskBuffer*>( self->diskBuffer )->Submit( totalSize );
});

// #NOTE: We don't need to signal that the pinned buffer is ready for re-use here as
// we do that implicitly with DiskBuffer::GetNextWriteBuffer (see above).
}
else if( !isDirect )
{
// #TODO: Do this in a different host copy stream, and signal from there.
// #MAYBE: Perhaps use multiple host threads/streams to do host-to-host copies.
// for now do it on the same download stream, but we will be blocking the download stream,
// unless other download streams are used by other buffers.


ASSERT( hostBuffer == pinnedBuffer );
if( isSequentialCopy )
CudaErrCheck( cudaMemcpyAsync( finalHostBuffer, hostBuffer, totalSize, cudaMemcpyHostToHost, downloadStream ) );
else
CudaErrCheck( cudaMemcpy2DAsync( finalHostBuffer, dstStride, hostBuffer, srcStride, width, height, cudaMemcpyHostToHost, downloadStream ) );

// Signal the pinned buffer is free to be re-used
CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], downloadStream ) );
}
}
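
PerformDownload2D therefore resolves to one of three routes: direct (device to final host buffer), staged (device to pinned, then pinned to host on the same stream), or disk-backed (device to pinned, then the pinned buffer is submitted to the DiskBuffer). A hedged caller-side sketch of the callback route (everything except the method and the DEV_MAX_PARK_SIZE stride is a placeholder):

// Hedged usage sketch of the download path above. `parksOut` is a
// GpuDownloadBuffer; the writer type and variable names are placeholders.
parksOut.Download2DWithCallback(
    hostParks,             // final destination (pinned buffer used when staging)
    parkSize, parkCount,   // width in bytes per row, number of rows
    parkSize,              // dstStride: rows packed tightly on the host
    DEV_MAX_PARK_SIZE,     // srcStride: device rows padded to the max park size
    []( void* buf, size_t size, void* user ) {
        // Invoked from the download stream once this bucket's copy (and any
        // disk submit) has been enqueued; hand the data to the plot writer.
        static_cast<PlotWriterLike*>( user )->Write( buf, size );
    },
    &writer,               // userData
    computeStream,         // workStream that produced the device data
    false );               // directOverride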

void GpuDownloadBuffer::CallHostFunctionOnStream( cudaStream_t stream, std::function<void()> func )
{
auto* fnCpy = new std::function<void()>( std::move( func ) );
CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ) {

auto& fn = *reinterpret_cast<std::function<void()>*>( userData );
fn();
delete& fn;

}, fnCpy ) );
}

void GpuDownloadBuffer::HostCallback( std::function<void()> func )
{
CallHostFunctionOnStream( self->queue->GetStream(), func );
}
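
cudaLaunchHostFunc only accepts a plain C function pointer plus a void*, which is why CallHostFunctionOnStream heap-copies the std::function and deletes it inside the trampoline. The same pattern in isolation:

// The trampoline pattern above, in isolation. The capturing callable is
// moved to the heap and reclaimed exactly once inside the callback.
#include <cuda_runtime.h>
#include <functional>

static void EnqueueHostWork( cudaStream_t stream, std::function<void()> fn )
{
    auto* heapFn = new std::function<void()>( std::move( fn ) );

    cudaLaunchHostFunc( stream, []( void* userData ) {
        auto* f = static_cast<std::function<void()>*>( userData );
        (*f)();    // runs on CUDA's host-callback thread
        delete f;  // note: CUDA forbids CUDA API calls inside this callback
    }, heapFn );
}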

void GpuDownloadBuffer::GetDownload2DCommand( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
uint32& outIndex, void*& outPinnedBuffer, const void*& outDevBuffer, GpuDownloadCallback callback, void* userData )
{
ASSERT( width );
ASSERT( height );
ASSERT( hostBuffer );

const uint32 index = self->outgoingSequence % self->bufferCount;

// We need to block until the pinned buffer is available.
if( self->outgoingSequence > self->bufferCount-1 )
self->fence.Wait( self->outgoingSequence - self->bufferCount + 1 );

void* pinnedBuffer = self->pinnedBuffer[index];
const void* devBuffer = self->deviceBuffer[index];

//auto& cmd = self->commands[index];
//cmd.type = GpuQueue::CommandType::Copy2D;
//cmd.sequenceId = self->outgoingSequence++;
//cmd.finishedSignal = &self->fence;
//cmd.dstBuffer = hostBuffer;
//cmd.srcBuffer = pinnedBuffer;
//cmd.copy2d.width = width;
//cmd.copy2d.height = height;
//cmd.copy2d.dstStride = dstStride;
//cmd.copy2d.srcStride = srcStride;
//cmd.copy2d.callback = callback;
//cmd.copy2d.userData = userData;

outIndex = index;
outPinnedBuffer = pinnedBuffer;
outDevBuffer = devBuffer;
}


void GpuDownloadBuffer::DownloadAndPackArray( void* hostBuffer, const uint32 length, size_t srcStride, const uint32* counts, const uint32 elementSize )
{
ASSERT( length );
ASSERT( elementSize );
ASSERT( counts );

uint32 totalElements = 0;
for( uint32 i = 0; i < length; i++ )
totalElements += counts[i];

const size_t totalSize = (size_t)totalElements * elementSize;

uint32 index;
void* pinnedBuffer;
const void* devBuffer;
GetDownload2DCommand( hostBuffer, totalSize, 1, totalSize, totalSize, index, pinnedBuffer, devBuffer );


srcStride *= elementSize;

byte* dst = (byte*)pinnedBuffer;
const byte* src = (byte*)devBuffer;

cudaStream_t stream = self->queue->_stream;

// Copy all buffers from device to pinned buffer
for( uint32 i = 0; i < length; i++ )
{
const size_t copySize = counts[i] * (size_t)elementSize;

// #TODO: Determine if there's a cuda (jagged) array copy
CudaErrCheck( cudaMemcpyAsync( dst, src, copySize, cudaMemcpyDeviceToHost, stream ) );

src += srcStride;
dst += copySize;
}

// Signal that the device buffer is free
CudaErrCheck( cudaEventRecord( self->events[index], stream ) );

// Submit command to do the final copy from pinned to host
CudaErrCheck( cudaLaunchHostFunc( stream, GpuQueue::CopyPendingDownloadStream, self ) );
}
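
DownloadAndPackArray flattens a strided, partially filled device layout (one slice per bucket, each with its own entry count) into one contiguous host run. A worked micro-example of the offset math, assuming three slices:

// Worked micro-example of the packing above (counts in elements).
const uint32 counts[3] = { 3, 1, 2 };
const uint32 srcStride = 4;  // elements between slice starts on the device

uint32 dstOffset = 0;
for( uint32 i = 0; i < 3; i++ )
{
    const uint32 srcOffset = i * srcStride;  // 0, 4, 8
    // copy counts[i] elements from srcOffset to dstOffset...
    dstOffset += counts[i];                  // 3, then 4, then 6 total
}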

void GpuDownloadBuffer::WaitForCompletion()
{
if( self->outgoingSequence > 0 )
{
//const uint32 index = (self->outgoingSequence - 1) % self->bufferCount;

// cudaEvent_t event = self->completedEvents[index];
//const cudaError_t r = cudaEventQuery( event );

//if( r == cudaSuccess )
// return;

//if( r != cudaErrorNotReady )
// CudaErrCheck( r );

//CudaErrCheck( cudaEventSynchronize( event ) );


cudaStream_t downloadStream = self->queue->_stream;
// this->self->fence.Reset( 0 );
CallHostFunctionOnStream( downloadStream, [this](){
this->self->fence.Signal( this->self->outgoingSequence );
});
self->fence.Wait( self->outgoingSequence );

}
}

void GpuDownloadBuffer::WaitForCopyCompletion()
{
if( self->outgoingSequence > 0 )
{
self->copyFence.Wait( self->outgoingSequence );
}
}

void GpuDownloadBuffer::Reset()
{
self->lockSequence = 0;
self->outgoingSequence = 0;
self->completedSequence = 0;
self->copySequence = 0;
self->fence.Reset( 0 );
self->copyFence.Reset( 0 );
}

GpuQueue* GpuDownloadBuffer::GetQueue() const
{
return self->queue;
}

void GpuDownloadBuffer::AssignDiskBuffer( DiskBufferBase* diskBuffer )
{
// ASSERT( self->pinnedBuffer[0] );

void* nullBuffers[2] = { nullptr, nullptr };
if( self->diskBuffer )
self->diskBuffer->AssignWriteBuffers( nullBuffers );

self->diskBuffer = diskBuffer;
if( self->diskBuffer )
self->diskBuffer->AssignWriteBuffers( self->pinnedBuffer );
}

DiskBufferBase* GpuDownloadBuffer::GetDiskBuffer() const
{
return self->diskBuffer;
}
432 changes: 432 additions & 0 deletions cuda/GpuQueue.cu

Large diffs are not rendered by default.

188 changes: 188 additions & 0 deletions cuda/GpuQueue.h
@@ -0,0 +1,188 @@
#pragma once

#include "GpuStreams.h"
#include <functional>

class DiskQueue;

struct GpuStreamDescriptor
{
size_t entrySize;
size_t entriesPerSlice;
uint32 sliceCount;
uint32 sliceAlignment;
uint32 bufferCount;
IAllocator* deviceAllocator;
IAllocator* pinnedAllocator;
DiskQueue* diskQueue; // DiskQueue to use when disk offload mode is enabled.
const char* diskFileName; // File name to use when disk offload mode is enabled. The diskQueue must be set.
bool bucketedDiskBuffer; // If true, a DiskBucketBuffer will be used instead of a DiskBuffer.
bool directIO; // If true, direct I/O will be used when using disk offload mode.
};
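
A GpuStreamDescriptor bundles everything the buffer factories below need; presumably CalculateBufferSizeFromDescriptor derives a buffer's size from entrySize × entriesPerSlice × sliceCount plus slice alignment. A hedged example of filling one for an in-memory (no disk offload) download stream; the allocators and capacity are placeholders:

// Hedged sketch: descriptor for a double-buffered, memory-only download stream.
GpuStreamDescriptor desc = {};
desc.entrySize          = sizeof( uint64 );
desc.entriesPerSlice    = maxEntriesPerSlice; // placeholder capacity
desc.sliceCount         = BBCU_BUCKET_COUNT;  // one slice per bucket
desc.sliceAlignment     = 4096;               // assumed page alignment
desc.bufferCount        = 2;                  // double-buffered
desc.deviceAllocator    = &devAllocator;      // caller's allocators
desc.pinnedAllocator    = &pinnedAllocator;
desc.diskQueue          = nullptr;            // no disk offload
desc.diskFileName       = nullptr;
desc.bucketedDiskBuffer = false;
desc.directIO           = false;

GpuDownloadBuffer lpOut = queue.CreateDownloadBufferT<uint64>( desc );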

typedef std::function<void()> GpuCallbackDispatch;

class GpuQueue
{
friend struct IGpuBuffer;
friend struct GpuDownloadBuffer;
friend struct GpuUploadBuffer;

enum class CommandType
{
None = 0,
Copy,
CopyArray,
Callback,
};

struct Command
{
CommandType type;

union
{
struct CopyInfo* copy;

struct {
GpuDownloadCallback callback;
size_t copySize;
void* dstbuffer;
void* userData;
} callback;
};
};

public:

enum Kind
{
Downloader,
Uploader
};

GpuQueue( Kind kind );
virtual ~GpuQueue();

static size_t CalculateSliceSizeFromDescriptor( const GpuStreamDescriptor& desc );
static size_t CalculateBufferSizeFromDescriptor( const GpuStreamDescriptor& desc );

//GpuDownloadBuffer CreateDownloadBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size = 0, bool dryRun = false );
//GpuDownloadBuffer CreateDownloadBuffer( const size_t size, bool dryRun = false );
GpuDownloadBuffer CreateDirectDownloadBuffer( size_t size, IAllocator& devAllocator, size_t alignment, bool dryRun = false );
GpuDownloadBuffer CreateDownloadBuffer( size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false );
GpuDownloadBuffer CreateDownloadBuffer( size_t size, uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false );

GpuDownloadBuffer CreateDownloadBuffer( const GpuStreamDescriptor& desc, bool dryRun = false );

/// Create with descriptor and override entry size
inline GpuDownloadBuffer CreateDownloadBuffer( const GpuStreamDescriptor& desc, size_t entrySize, bool dryRun = false )
{
GpuStreamDescriptor copy = desc;
copy.entrySize = entrySize;

return CreateDownloadBuffer( copy, dryRun );
}

template<typename T>
inline GpuDownloadBuffer CreateDownloadBufferT( const GpuStreamDescriptor& desc, bool dryRun = false )
{
return CreateDownloadBuffer( desc, sizeof( T ), dryRun );
}

/// Create with descriptor and override entry size
GpuUploadBuffer CreateUploadBuffer( const GpuStreamDescriptor& desc, bool dryRun = false );

// inline GpuUploadBuffer CreateUploadBuffer( const GpuStreamDescriptor& desc, bool size_t entrySize, bool dryRun = false )
// {
// GpuStreamDescriptor copy = desc;
// copy.entrySize = entrySize;

// return CreateUploadBuffer( copy, dryRun );
// }

template<typename T>
inline GpuUploadBuffer CreateUploadBufferT( const GpuStreamDescriptor& desc, bool dryRun = false )
{
GpuStreamDescriptor copy = desc;
copy.entrySize = sizeof(T);

return CreateUploadBuffer( copy, dryRun );
// return CreateUploadBuffer( desc, sizeof( T ), dryRun );
}


template<typename T>
inline GpuDownloadBuffer CreateDirectDownloadBuffer( const size_t count, IAllocator& devAllocator, size_t alignment = alignof( T ), bool dryRun = false )
{
return CreateDirectDownloadBuffer( count * sizeof( T ), devAllocator, alignment, dryRun );
}

template<typename T>
inline GpuDownloadBuffer CreateDownloadBufferT( const size_t count, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment = alignof( T ), bool dryRun = false )
{
return CreateDownloadBuffer( count * sizeof( T ), devAllocator, pinnedAllocator, alignment, dryRun );
}

template<typename T>
inline GpuDownloadBuffer CreateDownloadBufferT( const size_t count, uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment = alignof( T ), bool dryRun = false )
{
return CreateDownloadBuffer( count * sizeof( T ), bufferCount, devAllocator, pinnedAllocator, alignment, dryRun );
}

//GpuUploadBuffer CreateUploadBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size = 0, bool dryRun = false );
//GpuUploadBuffer CreateUploadBuffer( const size_t size, bool dryRun = false );
GpuUploadBuffer CreateUploadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false );

template<typename T>
inline GpuUploadBuffer CreateUploadBufferT( const size_t count, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false )
{
return CreateUploadBuffer( count * sizeof( T ), devAllocator, pinnedAllocator, alignment, dryRun );
}

inline cudaStream_t GetStream() const { return _stream; }

protected:

struct IGpuBuffer* CreateGpuBuffer( size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun );
struct IGpuBuffer* CreateGpuBuffer( const GpuStreamDescriptor& desc, bool dryRun );

void DispatchHostFunc( GpuCallbackDispatch func, cudaStream_t stream, cudaEvent_t lockEvent, cudaEvent_t completedEvent );

static void CopyPendingDownloadStream( void* userData );

[[nodiscard]]
Command& GetCommand( CommandType type );
void SubmitCommands();

// Copy threads
static void QueueThreadEntryPoint( GpuQueue* self );
void QueueThreadMain();

void ExecuteCommand( const Command& cpy );

bool ShouldExitQueueThread();

protected:
cudaStream_t _stream = nullptr;
cudaStream_t _preloadStream = nullptr;
cudaStream_t _callbackStream = nullptr;


Thread _queueThread;
//Fence _bufferReadySignal;
Semaphore _bufferReadySignal;
Fence _bufferCopiedSignal;
Fence _syncFence;
SPCQueue<Command, BBCU_BUCKET_COUNT*6> _queue;
Kind _kind;

AutoResetSignal _waitForExitSignal;
std::atomic<bool> _exitQueueThread = false;

// Support multiple threads to grab commands
std::atomic<uint64> _cmdTicketOut = 0;
std::atomic<uint64> _cmdTicketIn = 0;
std::atomic<uint64> _commitTicketOut = 0;
std::atomic<uint64> _commitTicketIn = 0;
};
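
The four ticket counters at the bottom suggest a multi-producer commit protocol: each submitter takes an ordered ticket, fills its command slot, then publishes strictly in ticket order so the queue thread never observes gaps. A sketch of that idea, inferred from the counter names only:

// Hedged sketch of ticket-ordered publication (inferred from the
// _cmdTicket*/_commitTicket* names; not the project's actual code).
#include <atomic>
#include <cstdint>
#include <thread>

std::atomic<uint64_t> ticketOut{ 0 }; // next ticket handed to a producer
std::atomic<uint64_t> ticketIn { 0 }; // tickets published so far

uint64_t AcquireTicket()
{
    return ticketOut.fetch_add( 1, std::memory_order_relaxed );
}

void Commit( uint64_t ticket )
{
    // Publish strictly in ticket order so the consumer sees a gapless queue.
    while( ticketIn.load( std::memory_order_acquire ) != ticket )
        std::this_thread::yield();
    ticketIn.store( ticket + 1, std::memory_order_release );
}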
1,044 changes: 173 additions & 871 deletions cuda/GpuStreams.cu

Large diffs are not rendered by default.

285 changes: 147 additions & 138 deletions cuda/GpuStreams.h
@@ -5,22 +5,127 @@
#include "threading/Fence.h"
#include "threading/Semaphore.h"
#include "util/SPCQueue.h"
#include "util/StackAllocator.h"
#include <functional>

//#define GPU_BUFFER_COUNT
class DiskBufferBase;
class DiskBuffer;
class DiskBucketBuffer;
struct GpuDownloadBuffer;
struct GpuUploadBuffer;
struct GpuQueue;

typedef std::function<void()> GpuStreamCallback;
typedef void (*GpuDownloadCallback)( void* hostBuffer, size_t downloadSize, void* userData );

struct PackedCopy
{
struct IGpuBuffer* self;
const byte* src;
uint32 sequence;
uint32 length;
uint32 stride;
uint32 elementSize;
uint32 counts[BBCU_BUCKET_COUNT];
};

struct DiskDataInfo
{
DiskBufferBase* diskBuffer;

union {
struct {
GpuUploadBuffer* self;
uint32 sequence;
} uploadInfo;

struct {
size_t srcStride;
} download2DInfo;

struct {
size_t size;
} downloadSequentialInfo;
};
};

struct CopyInfo
{
struct IGpuBuffer* self;
uint32 sequence;

const void* srcBuffer;
void* dstBuffer;
size_t width;
size_t height;
size_t dstStride;
size_t srcStride;

// Callback data
GpuDownloadCallback callback;
void* userData;
};

// Represents a double-buffered device buffer, which can be used with a GpuQueue to
// make fast transfers (via intermediate pinned memory)

class IAllocator;

enum class GpuStreamKind : uint32
{
Download = 0,
Upload
};

typedef void (*GpuDownloadCallback)( void* hostBuffer, size_t downloadSize, void* userData );
struct IGpuBuffer
{
size_t size;
uint32 bufferCount; // Number of pinned/device buffers this instance contains
void* deviceBuffer[BBCU_GPU_BUFFER_MAX_COUNT];
void* pinnedBuffer[BBCU_GPU_BUFFER_MAX_COUNT]; // Pinned host buffer


cudaEvent_t pinnedEvent[BBCU_GPU_BUFFER_MAX_COUNT]; // Signals that the pinned buffer is ready for use

union {
cudaEvent_t deviceEvents[BBCU_GPU_BUFFER_MAX_COUNT]; // Signals that the device buffer is ready for use
cudaEvent_t events [BBCU_GPU_BUFFER_MAX_COUNT]; // Signals the device buffer is ready for use
};


union {
cudaEvent_t workEvent   [BBCU_GPU_BUFFER_MAX_COUNT]; // Signals that the work stream is done w/ the device buffer, and it's ready for use
cudaEvent_t readyEvents [BBCU_GPU_BUFFER_MAX_COUNT]; // User must signal this event when the device buffer is ready for download
};
cudaEvent_t completedEvents[BBCU_GPU_BUFFER_MAX_COUNT]; // Signals the buffer is ready for consumption by the device or host

// For dispatching host callbacks.
// Each buffer uses its own function?
cudaEvent_t callbackLockEvent;
cudaEvent_t callbackCompletedEvent;

Fence fence; // Signals the pinned buffer is ready for use
Fence copyFence;

cudaEvent_t preloadEvents[BBCU_GPU_BUFFER_MAX_COUNT];


CopyInfo copies[BBCU_BUCKET_COUNT];
// union {
// PackedCopy packedCopeis[BBCU_BUCKET_COUNT]; // For upload buffers
DiskDataInfo diskData[BBCU_BUCKET_COUNT];
// };
// DiskBucketBuffer* diskBucketBuffer = nullptr;

// #TODO: Remove atomic again
uint32 lockSequence; // Index of next buffer to lock
uint32 outgoingSequence; // Index of locked buffer that will be downloaded/uploaded
std::atomic<uint32> completedSequence; // Index of buffer that finished downloading/uploading
std::atomic<uint32> copySequence;

GpuQueue* queue; // Queue associated with this buffer
DiskBufferBase* diskBuffer; // DiskBuffer, if any, used when disk offload mode is enabled.
};
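
Every per-buffer array in IGpuBuffer is indexed the same way: a monotonically increasing sequence counter taken modulo bufferCount, which is what turns a handful of pinned/device buffers into a ring. A worked micro-example with bufferCount = 2:

// Micro-example of the sequence indexing used throughout IGpuBuffer.
const uint32 bufferCount = 2;

for( uint32 seq = 0; seq < 4; seq++ )
{
    const uint32 index = seq % bufferCount; // 0, 1, 0, 1
    // seq 2 maps back to buffer 0, so locking it must first wait on the
    // event recorded when transfer #0 released that buffer.
}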



struct GpuDownloadBuffer
{
@@ -79,7 +184,7 @@ struct GpuDownloadBuffer
}

void DownloadWithCallback( void* hostBuffer, size_t size, GpuDownloadCallback callback, void* userData, cudaStream_t workStream = nullptr, bool directOverride = false );

// Performs a direct device-to-pinned buffer copy,
// and then a 2-dimensional copy from pinned buffer to host buffer
// - width : Size in bytes of each row to copy
@@ -98,6 +203,15 @@ struct GpuDownloadBuffer
Download2D( hostBuffer, width * sizeof( T ), height, dstStride * sizeof( T ), srcStride * sizeof( T ), workStream, directOverride );
}

template<typename T>
inline void Download2DWithCallbackT( T* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
GpuDownloadCallback callback, void* userData, cudaStream_t workStream = nullptr, bool directOverride = false )
{
Download2DWithCallback(
hostBuffer, width * sizeof( T ), height, dstStride * sizeof( T ), srcStride * sizeof( T ),
callback, userData, workStream, directOverride );
}

// Performs several gpu-to-pinned downloads, then copies the pinned data as a contiguous buffer
// to the destination host buffer
void DownloadAndPackArray( void* hostBuffer, uint32 length, size_t srcStride, const uint32* counts, uint32 elementSize );
@@ -120,25 +234,37 @@ struct GpuDownloadBuffer

class GpuQueue* GetQueue() const;

DiskBufferBase* GetDiskBuffer() const;
void AssignDiskBuffer( DiskBufferBase* diskBuffer );

void HostCallback( std::function<void()> func );

//private:
struct IGpuBuffer* self;

private:

void PerformDownload2D( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
GpuDownloadCallback postCallback, void* postUserData,
cudaStream_t workStream, bool directOverride );

void PerformDownload( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
GpuDownloadCallback callback, void* userData, cudaStream_t workStream, struct CopyInfo* copy = nullptr );

void GetDownload2DCommand( void* hostBuffer, size_t width, size_t height, size_t dstStride, size_t srcStride,
uint32& outIndex, void*& outPinnedBuffer, const void*& outDevBuffer, GpuDownloadCallback callback = nullptr, void* userData = nullptr );

void CallHostFunctionOnStream( cudaStream_t stream, std::function<void()> func );
};

struct GpuUploadBuffer
{
void Upload( const void* hostBuffer, size_t size, cudaStream_t workStream );
void Upload( const void* hostBuffer, size_t size, cudaStream_t workStream, bool directOverride = false );

template<typename T>
inline void UploadT( const T* hostBuffer, size_t count, cudaStream_t workStream )
inline void UploadT( const T* hostBuffer, size_t count, cudaStream_t workStream, bool directOverride = false )
{
Upload( hostBuffer, count * sizeof( T ), workStream );
Upload( hostBuffer, count * sizeof( T ), workStream, directOverride );
}

void Upload( const void* hostBuffer, size_t size );
@@ -152,7 +278,7 @@ struct GpuUploadBuffer
// Upload the host buffer, then copy the copyBufferSrc to the host buffer. Preloading
// data into that hostBuffer (should be pinned) as soon as it is free so that memory is ready for the next upload.
void UploadAndPreLoad( void* hostBuffer, size_t size, const void* copyBufferSrc, size_t copySize );

template<typename T>
inline void UploadAndPreLoadT( T* hostBuffer, const size_t count, const T* copyBufferSrc, const size_t copyCount )
{
@@ -170,25 +296,22 @@ struct GpuUploadBuffer

void UploadArray( const void* hostBuffer, uint32 length, uint32 elementSize, uint32 srcStrideBytes, uint32 countStride, const uint32* counts );

void UploadArrayForIndex( const uint32 index, const void* hostBuffer, uint32 length,
uint32 elementSize, uint32 srcStride, uint32 countStride, const uint32* counts );

// srcStride here is in element count
template<typename T>
inline void UploadArrayT( const T* hostBuffer, uint32 length, uint32 srcStride, uint32 countStride, const uint32* counts )
{
UploadArray( hostBuffer, length, (uint32)sizeof( T ), srcStride * (uint32)sizeof( T ), countStride, counts );
}


void* GetUploadedDeviceBuffer( cudaStream_t workStream );

template<typename T>
inline T* GetUploadedDeviceBufferT( cudaStream_t workStream ) { return (T*)GetUploadedDeviceBuffer( workStream ); }

// Waits until the earliest buffer has been uploaded to the GPU
// and returns the device buffer.
void* GetUploadedDeviceBuffer();
void* GetUploadedDeviceBuffer( cudaStream_t workStream );

template<typename T>
inline T* GetUploadedDeviceBufferT() { return (T*)GetUploadedDeviceBuffer(); }
inline T* GetUploadedDeviceBufferT( cudaStream_t workStream ) { return (T*)GetUploadedDeviceBuffer( workStream ); }

// #TODO: Pass in the buffer used as a reference so that it can be nullified, for safety.
void ReleaseDeviceBuffer( cudaStream_t workStream );
@@ -205,131 +328,17 @@ struct GpuUploadBuffer

class GpuQueue* GetQueue() const;

void AssignDiskBuffer( DiskBufferBase* diskBuffer );
DiskBufferBase* GetDiskBuffer() const;

void CallHostFunctionOnStream( cudaStream_t stream, std::function<void()> func );


//private:
struct IGpuBuffer* self;

private:
uint32 SynchronizeOutgoingSequence();
void* GetNextPinnedBuffer();
};


class GpuQueue
{
friend struct IGpuBuffer;
friend struct GpuDownloadBuffer;
friend struct GpuUploadBuffer;

enum class CommandType
{
None = 0,
Copy,
Callback,
};

struct Command
{
CommandType type;

union
{
struct CopyInfo* copy;

struct {
GpuDownloadCallback callback;
size_t copySize;
void* dstbuffer;
void* userData;
} callback;
};
};

public:

enum Kind
{
Downloader,
Uploader
};

GpuQueue( Kind kind );
virtual ~GpuQueue();

//void Synchronize();

//GpuDownloadBuffer CreateDownloadBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size = 0, bool dryRun = false );
//GpuDownloadBuffer CreateDownloadBuffer( const size_t size, bool dryRun = false );
GpuDownloadBuffer CreateDirectDownloadBuffer( size_t size, IAllocator& devAllocator, size_t alignment, bool dryRun = false );
GpuDownloadBuffer CreateDownloadBuffer( size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false );
GpuDownloadBuffer CreateDownloadBuffer( size_t size, uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false );

template<typename T>
inline GpuDownloadBuffer CreateDirectDownloadBuffer( const size_t count, IAllocator& devAllocator, size_t alignment = alignof( T ), bool dryRun = false )
{
return CreateDirectDownloadBuffer( count * sizeof( T ), devAllocator, alignment, dryRun );
}

template<typename T>
inline GpuDownloadBuffer CreateDownloadBufferT( const size_t count, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment = alignof( T ), bool dryRun = false )
{
return CreateDownloadBuffer( count * sizeof( T ), devAllocator, pinnedAllocator, alignment, dryRun );
}

template<typename T>
inline GpuDownloadBuffer CreateDownloadBufferT( const size_t count, uint32 bufferCount, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment = alignof( T ), bool dryRun = false )
{
return CreateDownloadBuffer( count * sizeof( T ), bufferCount, devAllocator, pinnedAllocator, alignment, dryRun );
}

//GpuUploadBuffer CreateUploadBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size = 0, bool dryRun = false );
//GpuUploadBuffer CreateUploadBuffer( const size_t size, bool dryRun = false );
GpuUploadBuffer CreateUploadBuffer( const size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false );

template<typename T>
inline GpuUploadBuffer CreateUploadBufferT( const size_t count, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun = false )
{
return CreateUploadBuffer( count * sizeof( T ), devAllocator, pinnedAllocator, alignment, dryRun );
}

inline cudaStream_t GetStream() const { return _stream; }

protected:

struct IGpuBuffer* CreateGpuBuffer( size_t size, IAllocator& devAllocator, IAllocator& pinnedAllocator, size_t alignment, bool dryRun );
struct IGpuBuffer* CreateGpuBuffer( size_t size, uint32 bufferCount, IAllocator* devAllocator, IAllocator* pinnedAllocator, size_t alignment, bool dryRun );
//struct IGpuBuffer* CreateGpuBuffer( const size_t size );
//struct IGpuBuffer* CreateGpuBuffer( void* dev0, void* dev1, void* pinned0, void* pinned1, size_t size );

static void CopyPendingDownloadStream( void* userData );

[[nodiscard]]
Command& GetCommand( CommandType type );
void SubmitCommands();

// Copy threads
static void CopyThreadEntryPoint( GpuQueue* self );
virtual void CopyThreadMain();

void ExecuteCommand( const Command& cpy );

bool ShouldExitCopyThread();

protected:
cudaStream_t _stream;
cudaStream_t _preloadStream;
Thread _copyThread;
//Fence _bufferReadySignal;
Semaphore _bufferReadySignal;
Fence _bufferCopiedSignal;
Fence _syncFence;
SPCQueue<Command, BBCU_BUCKET_COUNT*6> _queue;
Kind _kind;

AutoResetSignal _waitForExitSignal;
std::atomic<bool> _exitCopyThread = false;

// Support multiple threads to grab commands
std::atomic<uint64> _cmdTicketOut = 0;
std::atomic<uint64> _cmdTicketIn = 0;
std::atomic<uint64> _commitTicketOut = 0;
std::atomic<uint64> _commitTicketIn = 0;
};
7 changes: 7 additions & 0 deletions cuda/chacha8.cu
@@ -1,5 +1,6 @@
#include "pos/chacha8.h"
#include "CudaPlotContext.h"
#include "plotting/DiskBucketBuffer.h"

// #TEST
#if _DEBUG
@@ -247,6 +248,12 @@ void GenF1Cuda( CudaK32PlotContext& cx )
cx.metaOut.WaitForCompletion();
cx.yOut .Reset();
cx.metaOut.Reset();

if( cx.cfg.hybrid16Mode )
{
cx.diskContext->yBuffer->Swap();
cx.diskContext->metaBuffer->Swap();
}
}

///
60 changes: 60 additions & 0 deletions extract-version.ps1
@@ -0,0 +1,60 @@
# Navigate to the script's directory
$scriptPath = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent
Set-Location -Path $scriptPath

# Arguments
$ver_component = $args[0] # The user-specified component from the full version

# Read the version from the file
$version_str = (Get-Content 'VERSION' | Select-Object -First 1 | Out-String).Trim()
$bb_version_suffix = (Get-Content 'VERSION' | Select-Object -Last 1 | Out-String).Trim()
$version_header = 'src\Version.h'

if ($version_str -eq $bb_version_suffix) {
$bb_version_suffix = ""
}

# Prepend a '-' to the suffix, if necessary
if (-Not [string]::IsNullOrEmpty($bb_version_suffix) -and $bb_version_suffix[0] -ne '-') {
$bb_version_suffix = "-$bb_version_suffix"
}

# Parse the major, minor, and revision numbers
$bb_ver_maj, $bb_ver_min, $bb_ver_rev = $version_str -split '\.' | ForEach-Object { $_.Trim() }

# Get the Git commit hash
$bb_git_commit = $env:GITHUB_SHA
if ([string]::IsNullOrEmpty($bb_git_commit)) {
$bb_git_commit = & git rev-parse HEAD
}

if ([string]::IsNullOrEmpty($bb_git_commit)) {
$bb_git_commit = "unknown"
}

# Check if the user wants a specific component
if (-Not [string]::IsNullOrEmpty($ver_component)) {
switch ($ver_component) {
"major" {
Write-Host -NoNewline $bb_ver_maj
}
"minor" {
Write-Host -NoNewline $bb_ver_min
}
"revision" {
Write-Host -NoNewline $bb_ver_rev
}
"suffix" {
Write-Host -NoNewline $bb_version_suffix
}
"commit" {
Write-Host -NoNewline $bb_git_commit
}
default {
Write-Error "Invalid version component '$ver_component'"
exit 1
}
}
exit 0
}
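
For example, running ".\extract-version.ps1 major" prints only the major version number, and ".\extract-version.ps1 commit" prints the commit hash ($env:GITHUB_SHA on CI, falling back to "git rev-parse HEAD").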

9 changes: 5 additions & 4 deletions src/PlotContext.h
@@ -8,10 +8,11 @@

struct PlotRequest
{
const byte* plotId; // Id of the plot we want to create
const char* outDir; // Output plot directory
const char* plotFileName; // .plot.tmp file name
const byte* memo; // Plot memo
const byte* plotId; // Id of the plot we want to create
const char* outDir; // Output plot directory
const char* plotFileName; // .plot.tmp file name
const char* plotOutPath; // Full output path for the final .plot.tmp file
const byte* memo; // Plot memo
uint16 memoSize;
bool isFirstPlot;
bool IsFinalPlot;
1 change: 1 addition & 0 deletions src/PlotWriter.h
@@ -3,6 +3,7 @@
#include "threading/Thread.h"
#include "threading/Semaphore.h"


/**
* Handles writing the final plot to disk
*
12 changes: 12 additions & 0 deletions src/Types.h
@@ -1,5 +1,7 @@
#pragma once

#include <memory>

typedef uint8_t byte;
typedef uint8_t uint8;
typedef uint16_t uint16;
@@ -67,3 +69,13 @@ typedef uint128_t uint128;
typedef std::chrono::steady_clock::duration Duration;
typedef std::chrono::steady_clock::time_point TimePoint;
typedef std::chrono::nanoseconds NanoSeconds;


template<typename T>
using ptr = std::unique_ptr<T>;

template<typename T>
using sptr = std::shared_ptr<T>;

template<typename T>
using wptr = std::weak_ptr<T>;
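
The new aliases shorten the std smart-pointer spellings. A trivial usage sketch (Fence is an existing project type):

// Trivial usage of the new aliases.
ptr<Fence>  owned  = std::make_unique<Fence>();
sptr<Fence> shared = std::make_shared<Fence>();
wptr<Fence> weak   = shared;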