[Vulkan] Device API Multi-streams, multi-queue, and initial multi-thread support #2802

Merged 27 commits on Aug 27, 2021. The diff below shows changes from 23 of the 27 commits.

Commits:
14a44c5  Move to reference counted residency tracking & preparing for a multi … (bobcao3, Aug 25, 2021)
384cdcf  Multi stream & multi threading ready (bobcao3, Aug 25, 2021)
037e16c  Auto Format (taichi-gardener, Aug 25, 2021)
e1001e7  remove command list config (bobcao3, Aug 26, 2021)
5b345ac  Merge branch 'device-api-streams' of https://github.com/bobcao3/taich… (bobcao3, Aug 26, 2021)
b437bac  Accumulate into one command list until sync point (bobcao3, Aug 26, 2021)
15345e6  Auto Format (taichi-gardener, Aug 25, 2021)
e30c2d3  Accumulate into one command list until sync point (bobcao3, Aug 26, 2021)
f4e6740  Remove exposed mutex from ref counted pool (bobcao3, Aug 26, 2021)
17f07d9  Merge branch 'device-api-streams' of https://github.com/bobcao3/taich… (bobcao3, Aug 26, 2021)
40c31d7  Move ref counted pool to commons (bobcao3, Aug 26, 2021)
f4d0eb4  Auto Format (taichi-gardener, Aug 26, 2021)
fabde65  Move to reference counted residency tracking & preparing for a multi … (bobcao3, Aug 25, 2021)
bbd665d  Multi stream & multi threading ready (bobcao3, Aug 25, 2021)
597ba12  remove command list config (bobcao3, Aug 26, 2021)
0cca6f3  Auto Format (taichi-gardener, Aug 25, 2021)
a3137ae  Accumulate into one command list until sync point (bobcao3, Aug 26, 2021)
e7d9a3a  Remove exposed mutex from ref counted pool (bobcao3, Aug 26, 2021)
428d7e1  Move ref counted pool to commons (bobcao3, Aug 26, 2021)
1817c77  Merge branch 'master' of https://github.com/taichi-dev/taichi into de… (bobcao3, Aug 26, 2021)
78e6456  merge ggui updates (bobcao3, Aug 26, 2021)
41e4fb0  merge (bobcao3, Aug 26, 2021)
c4464a7  namespace (bobcao3, Aug 26, 2021)
42c2615  fix something (bobcao3, Aug 26, 2021)
a1fb99d  Move `get_graphics_stream` to graphics device (bobcao3, Aug 26, 2021)
8b045cd  Always create compute queue (bobcao3, Aug 27, 2021)
42df3ce  Auto Format (taichi-gardener, Aug 27, 2021)
taichi/backends/device.cpp (9 additions, 6 deletions)

@@ -25,25 +25,28 @@ void Device::memcpy(DevicePtr dst, DevicePtr src, uint64_t size) {
 void GraphicsDevice::image_transition(DeviceAllocation img,
                                       ImageLayout old_layout,
                                       ImageLayout new_layout) {
-  auto cmd_list = new_command_list({CommandListType::Graphics});
+  Stream *stream = get_graphics_stream();
+  auto cmd_list = stream->new_command_list();
   cmd_list->image_transition(img, old_layout, new_layout);
-  submit_synced(cmd_list.get());
+  stream->submit_synced(cmd_list.get());
 }
 void GraphicsDevice::buffer_to_image(DeviceAllocation dst_img,
                                      DevicePtr src_buf,
                                      ImageLayout img_layout,
                                      const BufferImageCopyParams &params) {
-  auto cmd_list = new_command_list({CommandListType::Graphics});
+  Stream *stream = get_graphics_stream();
+  auto cmd_list = stream->new_command_list();
   cmd_list->buffer_to_image(dst_img, src_buf, img_layout, params);
-  submit_synced(cmd_list.get());
+  stream->submit_synced(cmd_list.get());
 }
 void GraphicsDevice::image_to_buffer(DevicePtr dst_buf,
                                      DeviceAllocation src_img,
                                      ImageLayout img_layout,
                                      const BufferImageCopyParams &params) {
-  auto cmd_list = new_command_list({CommandListType::Graphics});
+  Stream *stream = get_graphics_stream();
+  auto cmd_list = stream->new_command_list();
   cmd_list->image_to_buffer(dst_buf, src_img, img_layout, params);
-  submit_synced(cmd_list.get());
+  stream->submit_synced(cmd_list.get());
 }
 
 }  // namespace lang
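Each helper above now routes its one-off work through the device's graphics stream and submits synchronously. Where several such operations happen back to back, the Stream interface introduced in device.h below also allows batching them under a single sync point. A minimal sketch of that usage, assuming `stream`, the images, and the layout values are already in scope (none of this appears in the diff itself):

    // Sketch: record two transitions into one command list, then sync once,
    // instead of paying a blocking submit_synced() per helper call.
    auto cmd_list = stream->new_command_list();
    cmd_list->image_transition(img_a, old_layout, new_layout);
    cmd_list->image_transition(img_b, old_layout, new_layout);
    stream->submit(cmd_list.get());  // enqueue without blocking the host
    stream->command_sync();          // one host-device sync point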
taichi/backends/device.h (15 additions, 14 deletions)

@@ -203,12 +203,6 @@ class Pipeline {
   virtual ResourceBinder *resource_binder() = 0;
 };
 
-enum class CommandListType { Graphics, Compute };
-
-struct CommandListConfig {
-  CommandListType type;
-};
-
 enum class ImageDimension { d1D, d2D, d3D };
 
 enum class ImageLayout {
@@ -327,6 +321,18 @@ inline bool operator&(AllocUsage a, AllocUsage b) {
   return static_cast<int>(a) & static_cast<int>(b);
 }
 
+class Stream {
+ public:
+  virtual ~Stream(){};
+
+  virtual std::unique_ptr<CommandList> new_command_list() = 0;
+  virtual void dealloc_command_list(CommandList *cmdlist) = 0;
+  virtual void submit(CommandList *cmdlist) = 0;
+  virtual void submit_synced(CommandList *cmdlist) = 0;
+
+  virtual void command_sync() = 0;
+};
+
 class Device {
  public:
   virtual ~Device(){};
@@ -378,14 +384,9 @@ class Device {
   // Copy memory inter or intra devices (synced)
   static void memcpy(DevicePtr dst, DevicePtr src, uint64_t size);
 
-  // TODO: Add a flag to select graphics / compute pool
-  virtual std::unique_ptr<CommandList> new_command_list(
-      CommandListConfig config) = 0;
-  virtual void dealloc_command_list(CommandList *cmdlist) = 0;
-  virtual void submit(CommandList *cmdlist) = 0;
-  virtual void submit_synced(CommandList *cmdlist) = 0;
-
-  virtual void command_sync() = 0;
+  // Each thread will acquire its own stream
+  virtual Stream *get_compute_stream() = 0;
+  virtual Stream *get_graphics_stream() = 0;
Review thread on get_graphics_stream():

Collaborator: nit: only declare this for GraphicsDevice..?

Author: right

  private:
   std::unordered_map<DeviceCapability, uint32_t> caps_;
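The comment on get_compute_stream()/get_graphics_stream() promises one stream per calling thread. A self-contained sketch of how a backend could satisfy that contract with a lazily populated, mutex-guarded map; VulkanStream here is a stand-in type, not the backend's actual implementation:

    #include <memory>
    #include <mutex>
    #include <thread>
    #include <unordered_map>

    struct VulkanStream { /* stand-in: would wrap a VkQueue + command pool */ };

    class StreamTable {
     public:
      // Returns the calling thread's stream, creating it on first use.
      VulkanStream *get_or_create() {
        std::lock_guard<std::mutex> lock(mut_);
        auto &slot = streams_[std::this_thread::get_id()];
        if (!slot) {
          slot = std::make_unique<VulkanStream>();
        }
        return slot.get();
      }

     private:
      std::mutex mut_;  // guards the map, not the streams themselves
      std::unordered_map<std::thread::id, std::unique_ptr<VulkanStream>> streams_;
    };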
taichi/backends/vulkan/runtime.cpp (77 additions, 35 deletions)

@@ -133,6 +133,22 @@ class HostDeviceContextBlitter {
       return;
     }
 
+    bool require_sync = ctx_attribs_->rets().size() > 0;
+    if (!require_sync) {
+      for (int i = 0; i < ctx_attribs_->args().size(); ++i) {
+        const auto &arg = ctx_attribs_->args()[i];
+        if (arg.is_array) {
+          require_sync = true;
+        }
+      }
+    }
+
+    if (require_sync) {
+      device_->get_compute_stream()->command_sync();
+    } else {
+      return;
+    }
+
     char *const device_base =
         reinterpret_cast<char *>(device_->map(*host_shadow_buffer_));
 
@@ -231,8 +247,9 @@ class CompiledTaichiKernel {
   };
 
   CompiledTaichiKernel(const Params &ti_params)
-      : ti_kernel_attribs_(*ti_params.ti_kernel_attribs) {
-    InputBuffersMap input_buffers = {
+      : ti_kernel_attribs_(*ti_params.ti_kernel_attribs),
+        device_(ti_params.device) {
+    input_buffers_ = {
         {BufferEnum::Root, ti_params.root_buffer},
         {BufferEnum::GlobalTmps, ti_params.global_tmps_buffer},
     };
@@ -241,44 +258,27 @@
       Device::AllocParams params;
       ctx_buffer_ = ti_params.device->allocate_memory_unique(
           {size_t(ctx_sz),
-           /*host_write*/ true, /*host_read*/ false});
+           /*host_write=*/true, /*host_read=*/false,
+           /*export_sharing=*/false, AllocUsage::Storage});
       ctx_buffer_host_ = ti_params.device->allocate_memory_unique(
           {size_t(ctx_sz),
-           /*host_write*/ false, /*host_read*/ true});
-      input_buffers[BufferEnum::Context] = ctx_buffer_.get();
+           /*host_write=*/false, /*host_read=*/true,
+           /*export_sharing=*/false, AllocUsage::Storage});
+      input_buffers_[BufferEnum::Context] = ctx_buffer_.get();
     }
 
     const auto &task_attribs = ti_kernel_attribs_.tasks_attribs;
     const auto &spirv_bins = ti_params.spirv_bins;
     TI_ASSERT(task_attribs.size() == spirv_bins.size());
 
-    cmdlist_ = ti_params.device->new_command_list({CommandListType::Compute});
     for (int i = 0; i < task_attribs.size(); ++i) {
       const auto &attribs = task_attribs[i];
       PipelineSourceDesc source_desc{PipelineSourceType::spirv_binary,
                                      (void *)spirv_bins[i].data(),
                                      spirv_bins[i].size() * sizeof(uint32_t)};
       auto vp = ti_params.device->create_pipeline(source_desc,
                                                   ti_kernel_attribs_.name);
-      const int group_x = (attribs.advisory_total_num_threads +
-                           attribs.advisory_num_threads_per_group - 1) /
-                          attribs.advisory_num_threads_per_group;
-      ResourceBinder *binder = vp->resource_binder();
-      for (auto &pair : input_buffers) {
-        binder->rw_buffer(0, uint32_t(pair.first), *pair.second);
-      }
-      cmdlist_->bind_pipeline(vp.get());
-      cmdlist_->bind_resources(binder);
-      cmdlist_->dispatch(group_x);
-      cmdlist_->memory_barrier();
       pipelines_.push_back(std::move(vp));
     }
-
-    if (!ti_kernel_attribs_.ctx_attribs.empty()) {
-      cmdlist_->buffer_copy(ctx_buffer_host_->get_ptr(0),
-                            ctx_buffer_->get_ptr(0), ctx_sz);
-      cmdlist_->buffer_barrier(*ctx_buffer_host_);
-    }
   }
 
   const TaichiKernelAttributes &ti_kernel_attribs() const {
@@ -297,12 +297,40 @@ class CompiledTaichiKernel {
     return ctx_buffer_host_.get();
   }
 
-  CommandList *command_list() const {
-    return cmdlist_.get();
+  void command_list(CommandList *cmdlist) const {
+    const auto &task_attribs = ti_kernel_attribs_.tasks_attribs;
+
+    for (int i = 0; i < task_attribs.size(); ++i) {
+      const auto &attribs = task_attribs[i];
+      auto vp = pipelines_[i].get();
+      const int group_x = (attribs.advisory_total_num_threads +
+                           attribs.advisory_num_threads_per_group - 1) /
+                          attribs.advisory_num_threads_per_group;
+      ResourceBinder *binder = vp->resource_binder();
+      for (auto &pair : input_buffers_) {
+        binder->rw_buffer(0, uint32_t(pair.first), *pair.second);
+      }
+      cmdlist->bind_pipeline(vp);
+      cmdlist->bind_resources(binder);
+      cmdlist->dispatch(group_x);
+      cmdlist->memory_barrier();
+    }
+
+    const auto ctx_sz = ti_kernel_attribs_.ctx_attribs.total_bytes();
+    if (!ti_kernel_attribs_.ctx_attribs.empty()) {
+      cmdlist->buffer_copy(ctx_buffer_host_->get_ptr(0),
+                           ctx_buffer_->get_ptr(0), ctx_sz);
+      cmdlist->buffer_barrier(*ctx_buffer_host_);
+    }
   }
 
  private:
   TaichiKernelAttributes ti_kernel_attribs_;
   std::vector<TaskAttributes> tasks_attribs_;
 
+  Device *device_;
+
+  InputBuffersMap input_buffers_;
+
   // Right now |ctx_buffer_| is allocated from a HOST_VISIBLE|COHERENT
   // memory, because we do not do computation on this buffer anyway, and it may
@@ -312,8 +340,6 @@ class CompiledTaichiKernel {
   std::unique_ptr<DeviceAllocationGuard> ctx_buffer_{nullptr};
   std::unique_ptr<DeviceAllocationGuard> ctx_buffer_host_{nullptr};
   std::vector<std::unique_ptr<Pipeline>> pipelines_;
-
-  std::unique_ptr<CommandList> cmdlist_;
 };
 
 }  // namespace
@@ -376,15 +402,22 @@ class VkRuntime ::Impl {
       ctx_blitter->host_to_device();
     }
 
-    device_->submit(ti_kernel->command_list());
+    if (!current_cmdlist_) {
+      current_cmdlist_ = device_->get_compute_stream()->new_command_list();
+    }
+
+    ti_kernel->command_list(current_cmdlist_.get());
+
     if (ctx_blitter) {
-      synchronize();
+      device_->get_compute_stream()->submit(current_cmdlist_.get());
       ctx_blitter->device_to_host();
+
+      current_cmdlist_ = nullptr;
     }
   }
 
   void synchronize() {
-    device_->command_sync();
+    device_->get_compute_stream()->command_sync();
   }
 
   Device *get_ti_device() const {
@@ -397,16 +430,23 @@ class VkRuntime ::Impl {
     size_t root_buffer_size = 64 * 1024 * 1024;
     size_t gtmp_buffer_size = 1024 * 1024;
 
-    root_buffer_ = device_->allocate_memory_unique({root_buffer_size});
-    global_tmps_buffer_ = device_->allocate_memory_unique({gtmp_buffer_size});
+    root_buffer_ = device_->allocate_memory_unique(
+        {root_buffer_size,
+         /*host_write=*/false, /*host_read=*/false,
+         /*export_sharing=*/false, AllocUsage::Storage});
+    global_tmps_buffer_ = device_->allocate_memory_unique(
+        {gtmp_buffer_size,
+         /*host_write=*/false, /*host_read=*/false,
+         /*export_sharing=*/false, AllocUsage::Storage});
 
     // Need to zero fill the buffers, otherwise there could be NaN.
-    auto cmdlist = device_->new_command_list({CommandListType::Compute});
+    Stream *stream = device_->get_compute_stream();
+    auto cmdlist = stream->new_command_list();
     cmdlist->buffer_fill(root_buffer_->get_ptr(0), root_buffer_size,
                          /*data=*/0);
     cmdlist->buffer_fill(global_tmps_buffer_->get_ptr(0), gtmp_buffer_size,
                          /*data=*/0);
-    device_->submit_synced(cmdlist.get());
+    stream->submit_synced(cmdlist.get());
   }
 
   const SNodeDescriptorsMap *const snode_descriptors_;
@@ -419,6 +459,8 @@ class VkRuntime ::Impl {
 
   Device *device_;
 
+  std::unique_ptr<CommandList> current_cmdlist_{nullptr};
+
   std::vector<std::unique_ptr<CompiledTaichiKernel>> ti_kernels_;
 };
 
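The launch_kernel change above implements deferred submission: every launch records into one shared command list, and the list is flushed only when results must become visible to the host. A condensed, self-contained sketch of that control flow with simplified stand-in types (the real code paths live in VkRuntime::Impl and may differ in detail):

    #include <memory>

    struct CommandList { /* recorded GPU work */ };

    struct Stream {
      std::unique_ptr<CommandList> new_command_list() {
        return std::make_unique<CommandList>();
      }
      void submit(CommandList *) { /* enqueue on the hardware queue */ }
      void command_sync() { /* block until the queue drains */ }
    };

    struct Runtime {
      Stream stream;
      std::unique_ptr<CommandList> current_cmdlist;

      // Each launch appends to the shared list; nothing is submitted yet.
      void launch(bool needs_host_readback) {
        if (!current_cmdlist) {
          current_cmdlist = stream.new_command_list();
        }
        // ... record this kernel's dispatches into *current_cmdlist ...
        if (needs_host_readback) {
          stream.submit(current_cmdlist.get());  // flush at the sync point
          stream.command_sync();
          current_cmdlist = nullptr;  // next launch starts a fresh list
        }
      }
    };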
taichi/backends/vulkan/spirv_ir_builder.cpp (8 additions, 0 deletions)

@@ -314,6 +314,14 @@ SType IRBuilder::get_struct_array_type(const SType &value_type,
     TI_ERROR("buffer type must be primitive or snode struct");
   }
 
+  if (nbytes == 0) {
+    if (value_type.flag == TypeKind::kPrimitive) {
+      TI_WARN("Invalid primitive bit size");
+    } else {
+      TI_WARN("Invalid container stride");
+    }
+  }
+
   // decorate the array type
   this->decorate(spv::OpDecorate, arr_type, spv::DecorationArrayStride, nbytes);
   // declare struct of array
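ArrayStride is the byte distance between consecutive array elements in a buffer, so decorating with a stride of 0 would produce an invalid SPIR-V module; the new warning surfaces that case before the decoration is emitted. A small illustrative sketch of the invariant, using a hypothetical helper (not the builder's actual code):

    #include <cassert>
    #include <cstdint>

    // Hypothetical helper: stride of a densely packed primitive array.
    // element_bits == 0 (an unresolved type) is exactly the case the
    // "Invalid primitive bit size" warning above reports.
    uint32_t array_stride_bytes(uint32_t element_bits) {
      assert(element_bits > 0 && element_bits % 8 == 0);
      return element_bits / 8;
    }

    int main() {
      assert(array_stride_bytes(32) == 4);  // e.g. f32/i32 elements: stride 4
      return 0;
    }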