From 705915ee007d79e6137d7ed6b54617146af87d97 Mon Sep 17 00:00:00 2001 From: MCJack123 Date: Thu, 27 Jun 2024 19:07:36 -0400 Subject: [PATCH] Attempted to fix Floyd-Steinberg dithering on GPU --- 32vid-player-mini.lua | 18 +++++++++--------- src/cc-pixel.cpp | 40 ++++++++++++++++++++++++++++++++-------- src/quantize.cpp | 19 +++++++++++++++---- 3 files changed, 56 insertions(+), 21 deletions(-) diff --git a/32vid-player-mini.lua b/32vid-player-mini.lua index 16270d4..0f6a4c0 100644 --- a/32vid-player-mini.lua +++ b/32vid-player-mini.lua @@ -13,7 +13,7 @@ end if file.read(4) ~= "32VD" then file.close() error("Not a 32Vid file") end local width, height, fps, nstreams, flags = (" 3000 then sleep(0) lastyield = os.epoch "utc" end local dcstart = os.epoch "utc" @@ -179,9 +179,9 @@ for _ = 1, nframes do else delete[#delete+1] = i end end for i, v in ipairs(delete) do table.remove(subs, v - i + 1) end - term.setCursorPos(1, height + 1) - term.clearLine() - print("Frame decode time:", dctime, "ms") + --term.setCursorPos(1, height + 1) + --term.clearLine() + --print("Frame decode time:", dctime, "ms") vframe = vframe + 1 elseif ftype == 1 then local audio = file.read(size) @@ -203,7 +203,7 @@ for _ = 1, nframes do elseif ftype >= 0x40 and ftype < 0x80 then if ftype == 64 then vframe = vframe + 1 end local mx, my = bit32_band(bit32_rshift(ftype, 3), 7) + 1, bit32_band(ftype, 7) + 1 - print("(" .. mx .. ", " .. my .. ")") + --print("(" .. mx .. ", " .. my .. ")") local term = monitors[my][mx] if os.epoch "utc" - lastyield > 3000 then sleep(0) lastyield = os.epoch "utc" end local width, height = ("= width) progress[id] = width + 2; - else if (id == 0 || progress[id-1] >= x + 2) { + __private uint id; + __private ulong yoff; + + if (get_local_id(0) == 0) { + workgroup_number = atomic_inc(workgroup_rider); + for (int i = 0; i < get_local_size(0); i++) progress[i] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + id = workgroup_number * get_local_size(0) + get_local_id(0); + yoff = id*width; + for (x = 0; x < width;) { + if (get_local_id(0) == 0 ? workgroup_number == 0 || workgroup_progress[workgroup_number-1] >= x + 2 : progress[get_local_id(0)-1] >= x + 2) { if (id < height) { pix.x = image[(yoff+x)*3]; pix.y = image[(yoff+x)*3+1]; pix.z = image[(yoff+x)*3+2]; pix += vload3(yoff + x, error); @@ -436,10 +457,13 @@ __kernel void floydSteinbergDither(__global const uchar * image, __global uchar if (x > 0) vstore3(vload3(yoff + width + x - 1, error) + (err * 0.125f), yoff + width + x - 1, error); vstore3(vload3(yoff + width + x, error) + (err * 0.1875f), yoff + width + x, error); } - x++; progress[id]++; + if (get_local_id(0) == get_local_size(0) - 1) workgroup_progress[workgroup_number] = x; + else progress[get_local_id(0)] = x; + x++; } } - progress[id] = width + 2; + if (get_local_id(0) == get_local_size(0) - 1) workgroup_progress[workgroup_number] = width + 2; + else progress[get_local_id(0)] = width + 2; } static __constant int thresholdMap[8][8] = { diff --git a/src/quantize.cpp b/src/quantize.cpp index 04d455b..5412dc1 100644 --- a/src/quantize.cpp +++ b/src/quantize.cpp @@ -553,16 +553,27 @@ Mat ditherImage(Mat& image, const std::vector& palette, OpenCL::Device * Mat retval(image.width, image.height, device); #ifdef HAS_OPENCL if (device != NULL && false) { - ulong progress_size = image.height / WORKGROUP_SIZE * WORKGROUP_SIZE + (image.height % WORKGROUP_SIZE ? WORKGROUP_SIZE : 0); + ulong progress_size = image.height / WORKGROUP_SIZE + (image.height % WORKGROUP_SIZE ? 1 : 0); OpenCL::Memory palette_mem(*device, palette.size(), 3); for (int i = 0; i < palette.size(); i++) {palette_mem[i*3] = palette[i][0]; palette_mem[i*3+1] = palette[i][1]; palette_mem[i*3+2] = palette[i][2];} OpenCL::Memory error(*device, image.width * (image.height + 1) * 3, 1, false, true); - OpenCL::Memory progress(*device, progress_size, 1, false, true); + OpenCL::Memory workgroup_rider(*device, 1, 1, false, true); + OpenCL::Memory workgroup_progress(*device, progress_size, 1, false, true); + OpenCL::Memory progress(*device, WORKGROUP_SIZE, 1, false, true); device->get_cl_queue().enqueueFillBuffer(error.get_cl_buffer(), 0.0f, 0, image.width * (image.height + 1) * 3 * sizeof(float)); - device->get_cl_queue().enqueueFillBuffer(progress.get_cl_buffer(), 0u, 0, progress_size * sizeof(uint)); palette_mem.enqueue_write_to_device(); image.upload(); - OpenCL::Kernel kernel(*device, image.height, "floydSteinbergDither", *image.mem, *retval.mem, palette_mem, (uchar)palette.size(), error, progress, (ulong)image.width, (ulong)image.height); + OpenCL::Kernel kernel( + *device, image.height, "floydSteinbergDither", + *image.mem, + *retval.mem, + palette_mem, + (uchar)palette.size(), + error, + workgroup_rider, + workgroup_progress, + progress, + (ulong)image.width, (ulong)image.height); kernel.run(); retval.onHost = false; retval.onDevice = true;