Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[vulkan] Revert "Device API Multi-streams, multi-queue, and initial multi-thread support" #2822

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 6 additions & 9 deletions taichi/backends/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,28 +25,25 @@ void Device::memcpy(DevicePtr dst, DevicePtr src, uint64_t size) {
void GraphicsDevice::image_transition(DeviceAllocation img,
ImageLayout old_layout,
ImageLayout new_layout) {
Stream *stream = get_graphics_stream();
auto cmd_list = stream->new_command_list();
auto cmd_list = new_command_list({CommandListType::Graphics});
cmd_list->image_transition(img, old_layout, new_layout);
stream->submit_synced(cmd_list.get());
submit_synced(cmd_list.get());
}
void GraphicsDevice::buffer_to_image(DeviceAllocation dst_img,
DevicePtr src_buf,
ImageLayout img_layout,
const BufferImageCopyParams &params) {
Stream *stream = get_graphics_stream();
auto cmd_list = stream->new_command_list();
auto cmd_list = new_command_list({CommandListType::Graphics});
cmd_list->buffer_to_image(dst_img, src_buf, img_layout, params);
stream->submit_synced(cmd_list.get());
submit_synced(cmd_list.get());
}
void GraphicsDevice::image_to_buffer(DevicePtr dst_buf,
DeviceAllocation src_img,
ImageLayout img_layout,
const BufferImageCopyParams &params) {
Stream *stream = get_graphics_stream();
auto cmd_list = stream->new_command_list();
auto cmd_list = new_command_list({CommandListType::Graphics});
cmd_list->image_to_buffer(dst_buf, src_img, img_layout, params);
stream->submit_synced(cmd_list.get());
submit_synced(cmd_list.get());
}

} // namespace lang
Expand Down
30 changes: 14 additions & 16 deletions taichi/backends/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,12 @@ class Pipeline {
virtual ResourceBinder *resource_binder() = 0;
};

enum class CommandListType { Graphics, Compute };

struct CommandListConfig {
CommandListType type;
};

enum class ImageDimension { d1D, d2D, d3D };

enum class ImageLayout {
Expand Down Expand Up @@ -321,18 +327,6 @@ inline bool operator&(AllocUsage a, AllocUsage b) {
return static_cast<int>(a) & static_cast<int>(b);
}

class Stream {
public:
virtual ~Stream(){};

virtual std::unique_ptr<CommandList> new_command_list() = 0;
virtual void dealloc_command_list(CommandList *cmdlist) = 0;
virtual void submit(CommandList *cmdlist) = 0;
virtual void submit_synced(CommandList *cmdlist) = 0;

virtual void command_sync() = 0;
};

class Device {
public:
virtual ~Device(){};
Expand Down Expand Up @@ -384,8 +378,14 @@ class Device {
// Copy memory inter or intra devices (synced)
static void memcpy(DevicePtr dst, DevicePtr src, uint64_t size);

// Each thread will acquire its own stream
virtual Stream *get_compute_stream() = 0;
// TODO: Add a flag to select graphics / compute pool
virtual std::unique_ptr<CommandList> new_command_list(
CommandListConfig config) = 0;
virtual void dealloc_command_list(CommandList *cmdlist) = 0;
virtual void submit(CommandList *cmdlist) = 0;
virtual void submit_synced(CommandList *cmdlist) = 0;

virtual void command_sync() = 0;

private:
std::unordered_map<DeviceCapability, uint32_t> caps_;
Expand Down Expand Up @@ -448,8 +448,6 @@ class GraphicsDevice : public Device {
const std::vector<VertexInputAttribute> &vertex_attrs,
std::string name = "Pipeline") = 0;

virtual Stream *get_graphics_stream() = 0;

virtual std::unique_ptr<Surface> create_surface(
const SurfaceConfig &config) = 0;
virtual DeviceAllocation create_image(const ImageParams &params) = 0;
Expand Down
112 changes: 35 additions & 77 deletions taichi/backends/vulkan/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,22 +133,6 @@ class HostDeviceContextBlitter {
return;
}

bool require_sync = ctx_attribs_->rets().size() > 0;
if (!require_sync) {
for (int i = 0; i < ctx_attribs_->args().size(); ++i) {
const auto &arg = ctx_attribs_->args()[i];
if (arg.is_array) {
require_sync = true;
}
}
}

if (require_sync) {
device_->get_compute_stream()->command_sync();
} else {
return;
}

char *const device_base =
reinterpret_cast<char *>(device_->map(*host_shadow_buffer_));

Expand Down Expand Up @@ -247,9 +231,8 @@ class CompiledTaichiKernel {
};

CompiledTaichiKernel(const Params &ti_params)
: ti_kernel_attribs_(*ti_params.ti_kernel_attribs),
device_(ti_params.device) {
input_buffers_ = {
: ti_kernel_attribs_(*ti_params.ti_kernel_attribs) {
InputBuffersMap input_buffers = {
{BufferEnum::Root, ti_params.root_buffer},
{BufferEnum::GlobalTmps, ti_params.global_tmps_buffer},
};
Expand All @@ -258,27 +241,44 @@ class CompiledTaichiKernel {
Device::AllocParams params;
ctx_buffer_ = ti_params.device->allocate_memory_unique(
{size_t(ctx_sz),
/*host_write=*/true, /*host_read=*/false,
/*export_sharing=*/false, AllocUsage::Storage});
/*host_write*/ true, /*host_read*/ false});
ctx_buffer_host_ = ti_params.device->allocate_memory_unique(
{size_t(ctx_sz),
/*host_write=*/false, /*host_read=*/true,
/*export_sharing=*/false, AllocUsage::Storage});
input_buffers_[BufferEnum::Context] = ctx_buffer_.get();
/*host_write*/ false, /*host_read*/ true});
input_buffers[BufferEnum::Context] = ctx_buffer_.get();
}

const auto &task_attribs = ti_kernel_attribs_.tasks_attribs;
const auto &spirv_bins = ti_params.spirv_bins;
TI_ASSERT(task_attribs.size() == spirv_bins.size());

cmdlist_ = ti_params.device->new_command_list({CommandListType::Compute});
for (int i = 0; i < task_attribs.size(); ++i) {
const auto &attribs = task_attribs[i];
PipelineSourceDesc source_desc{PipelineSourceType::spirv_binary,
(void *)spirv_bins[i].data(),
spirv_bins[i].size() * sizeof(uint32_t)};
auto vp = ti_params.device->create_pipeline(source_desc,
ti_kernel_attribs_.name);
const int group_x = (attribs.advisory_total_num_threads +
attribs.advisory_num_threads_per_group - 1) /
attribs.advisory_num_threads_per_group;
ResourceBinder *binder = vp->resource_binder();
for (auto &pair : input_buffers) {
binder->rw_buffer(0, uint32_t(pair.first), *pair.second);
}
cmdlist_->bind_pipeline(vp.get());
cmdlist_->bind_resources(binder);
cmdlist_->dispatch(group_x);
cmdlist_->memory_barrier();
pipelines_.push_back(std::move(vp));
}

if (!ti_kernel_attribs_.ctx_attribs.empty()) {
cmdlist_->buffer_copy(ctx_buffer_host_->get_ptr(0),
ctx_buffer_->get_ptr(0), ctx_sz);
cmdlist_->buffer_barrier(*ctx_buffer_host_);
}
}

const TaichiKernelAttributes &ti_kernel_attribs() const {
Expand All @@ -297,40 +297,12 @@ class CompiledTaichiKernel {
return ctx_buffer_host_.get();
}

void command_list(CommandList *cmdlist) const {
const auto &task_attribs = ti_kernel_attribs_.tasks_attribs;

for (int i = 0; i < task_attribs.size(); ++i) {
const auto &attribs = task_attribs[i];
auto vp = pipelines_[i].get();
const int group_x = (attribs.advisory_total_num_threads +
attribs.advisory_num_threads_per_group - 1) /
attribs.advisory_num_threads_per_group;
ResourceBinder *binder = vp->resource_binder();
for (auto &pair : input_buffers_) {
binder->rw_buffer(0, uint32_t(pair.first), *pair.second);
}
cmdlist->bind_pipeline(vp);
cmdlist->bind_resources(binder);
cmdlist->dispatch(group_x);
cmdlist->memory_barrier();
}

const auto ctx_sz = ti_kernel_attribs_.ctx_attribs.total_bytes();
if (!ti_kernel_attribs_.ctx_attribs.empty()) {
cmdlist->buffer_copy(ctx_buffer_host_->get_ptr(0),
ctx_buffer_->get_ptr(0), ctx_sz);
cmdlist->buffer_barrier(*ctx_buffer_host_);
}
CommandList *command_list() const {
return cmdlist_.get();
}

private:
TaichiKernelAttributes ti_kernel_attribs_;
std::vector<TaskAttributes> tasks_attribs_;

Device *device_;

InputBuffersMap input_buffers_;

// Right now |ctx_buffer_| is allocated from a HOST_VISIBLE|COHERENT
// memory, because we do not do computation on this buffer anyway, and it may
Expand All @@ -340,6 +312,8 @@ class CompiledTaichiKernel {
std::unique_ptr<DeviceAllocationGuard> ctx_buffer_{nullptr};
std::unique_ptr<DeviceAllocationGuard> ctx_buffer_host_{nullptr};
std::vector<std::unique_ptr<Pipeline>> pipelines_;

std::unique_ptr<CommandList> cmdlist_;
};

} // namespace
Expand Down Expand Up @@ -402,22 +376,15 @@ class VkRuntime ::Impl {
ctx_blitter->host_to_device();
}

if (!current_cmdlist_) {
current_cmdlist_ = device_->get_compute_stream()->new_command_list();
}

ti_kernel->command_list(current_cmdlist_.get());

device_->submit(ti_kernel->command_list());
if (ctx_blitter) {
device_->get_compute_stream()->submit(current_cmdlist_.get());
synchronize();
ctx_blitter->device_to_host();

current_cmdlist_ = nullptr;
}
}

void synchronize() {
device_->get_compute_stream()->command_sync();
device_->command_sync();
}

Device *get_ti_device() const {
Expand All @@ -430,23 +397,16 @@ class VkRuntime ::Impl {
size_t root_buffer_size = 64 * 1024 * 1024;
size_t gtmp_buffer_size = 1024 * 1024;

root_buffer_ = device_->allocate_memory_unique(
{root_buffer_size,
/*host_write=*/false, /*host_read=*/false,
/*export_sharing=*/false, AllocUsage::Storage});
global_tmps_buffer_ = device_->allocate_memory_unique(
{gtmp_buffer_size,
/*host_write=*/false, /*host_read=*/false,
/*export_sharing=*/false, AllocUsage::Storage});
root_buffer_ = device_->allocate_memory_unique({root_buffer_size});
global_tmps_buffer_ = device_->allocate_memory_unique({gtmp_buffer_size});

// Need to zero fill the buffers, otherwise there could be NaN.
Stream *stream = device_->get_compute_stream();
auto cmdlist = stream->new_command_list();
auto cmdlist = device_->new_command_list({CommandListType::Compute});
cmdlist->buffer_fill(root_buffer_->get_ptr(0), root_buffer_size,
/*data=*/0);
cmdlist->buffer_fill(global_tmps_buffer_->get_ptr(0), gtmp_buffer_size,
/*data=*/0);
stream->submit_synced(cmdlist.get());
device_->submit_synced(cmdlist.get());
}

const SNodeDescriptorsMap *const snode_descriptors_;
Expand All @@ -459,8 +419,6 @@ class VkRuntime ::Impl {

Device *device_;

std::unique_ptr<CommandList> current_cmdlist_{nullptr};

std::vector<std::unique_ptr<CompiledTaichiKernel>> ti_kernels_;
};

Expand Down
8 changes: 0 additions & 8 deletions taichi/backends/vulkan/spirv_ir_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -314,14 +314,6 @@ SType IRBuilder::get_struct_array_type(const SType &value_type,
TI_ERROR("buffer type must be primitive or snode struct");
}

if (nbytes == 0) {
if (value_type.flag == TypeKind::kPrimitive) {
TI_WARN("Invalid primitive bit size");
} else {
TI_WARN("Invalid container stride");
}
}

// decorate the array type
this->decorate(spv::OpDecorate, arr_type, spv::DecorationArrayStride, nbytes);
// declare struct of array
Expand Down
Loading