From a12554c583020f021b2df36ea95f4ad63de2be20 Mon Sep 17 00:00:00 2001
From: melonedo <44501064+melonedo@users.noreply.github.com>
Date: Tue, 24 Sep 2024 21:48:53 +0800
Subject: [PATCH] Support encoding for padded 444_U8_P012 images

---
 libgpujpeg/gpujpeg_common.h |  2 ++
 src/gpujpeg_common.c        |  4 +++-
 src/gpujpeg_preprocessor.cu | 34 +++++++++++++++++++++++-----------
 src/main.c                  |  2 +-
 4 files changed, 29 insertions(+), 13 deletions(-)
diff --git a/libgpujpeg/gpujpeg_common.h b/libgpujpeg/gpujpeg_common.h
index 8f006efc..1cc120e8 100644
--- a/libgpujpeg/gpujpeg_common.h
+++ b/libgpujpeg/gpujpeg_common.h
@@ -275,6 +275,8 @@ struct gpujpeg_image_parameters {
     enum gpujpeg_color_space color_space;
     /// Image data sampling factor
     enum gpujpeg_pixel_format pixel_format;
+    /// Number of padding bytes after each row of raw image data (0 = tightly packed); encoder supports it only for GPUJPEG_444_U8_P012
+    int width_padding;
 };
 
 /**
diff --git a/src/gpujpeg_common.c b/src/gpujpeg_common.c
index 44b55fc9..51070ed5 100644
--- a/src/gpujpeg_common.c
+++ b/src/gpujpeg_common.c
@@ -343,6 +343,7 @@ gpujpeg_image_set_default_parameters(struct gpujpeg_image_parameters* param)
     param->height = 0;
     param->color_space = GPUJPEG_RGB;
     param->pixel_format = GPUJPEG_444_U8_P012;
+    param->width_padding = 0;
 }
 
 struct gpujpeg_image_parameters
@@ -362,7 +363,8 @@ gpujpeg_image_parameters_equals(const struct gpujpeg_image_parameters *p1 , cons
     return p1->width == p2->width &&
         p1->height == p2->height &&
         p1->color_space == p2->color_space &&
-        p1->pixel_format == p2->pixel_format;
+        p1->pixel_format == p2->pixel_format &&
+        p1->width_padding == p2->width_padding;
 }
 
 /* Documented at declaration */
diff --git a/src/gpujpeg_preprocessor.cu b/src/gpujpeg_preprocessor.cu
index a39b799d..2cbfeca8 100644
--- a/src/gpujpeg_preprocessor.cu
+++ b/src/gpujpeg_preprocessor.cu
@@ -85,6 +85,9 @@ struct gpujpeg_preprocessor_raw_to_comp_store {
 template<enum gpujpeg_pixel_format>
 inline __device__ void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r);
 
+template<enum gpujpeg_pixel_format>
+constexpr int __device__ unit_size() { return 1; } // bytes that one image_position step advances in d_data_raw; 1 unless a pixel format specializes it
+
 template<>
 inline __device__ void raw_to_comp_load<GPUJPEG_U8>(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r)
 {
@@ -118,18 +121,22 @@ inline __device__ void raw_to_comp_load<GPUJPEG_420_U8_P0P1P2>(const uint8_t* d_
 }
 
 template<>
-inline __device__ void raw_to_comp_load<GPUJPEG_444_U8_P012>(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r)
+constexpr int __device__ unit_size<GPUJPEG_444_U8_P012>() { return 3; }
+
+template<>
+inline __device__ void raw_to_comp_load<GPUJPEG_444_U8_P012>(const uint8_t* d_data_raw, int &image_width, int &image_height, int &offset, int &x, int &y, uchar4 &r)
 {
-    const unsigned int offset = image_position * 3;
     r.x = d_data_raw[offset];
     r.y = d_data_raw[offset + 1];
     r.z = d_data_raw[offset + 2];
 }
 
 template<>
-inline __device__ void raw_to_comp_load<GPUJPEG_4444_U8_P0123>(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r)
+constexpr int __device__ unit_size<GPUJPEG_4444_U8_P0123>() { return 4; }
+
+template<>
+inline __device__ void raw_to_comp_load<GPUJPEG_4444_U8_P0123>(const uint8_t* d_data_raw, int &image_width, int &image_height, int &offset, int &x, int &y, uchar4 &r)
 {
-    const unsigned int offset = image_position * 4;
     r.x = d_data_raw[offset];
     r.y = d_data_raw[offset + 1];
     r.z = d_data_raw[offset + 2];
@@ -137,11 +144,13 @@ inline __device__ void raw_to_comp_load<GPUJPEG_4444_U8_P0123>(const uint8_t* d_
 }
 
 template<>
-inline __device__ void raw_to_comp_load<GPUJPEG_422_U8_P1020>(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r)
+constexpr int __device__ unit_size<GPUJPEG_422_U8_P1020>() { return 2; }
+
+template<>
+inline __device__ void raw_to_comp_load<GPUJPEG_422_U8_P1020>(const uint8_t* d_data_raw, int &image_width, int &image_height, int &offset, int &x, int &y, uchar4 &r)
 {
-    const unsigned int offset = image_position * 2;
     r.x = d_data_raw[offset + 1];
-    if ( image_position % 2 == 0 ) {
+    if ( offset % 4 == 0 ) {
         r.y = d_data_raw[offset];
         r.z = d_data_raw[offset + 2];
     } else {
@@ -153,7 +162,7 @@ inline __device__ void raw_to_comp_load<GPUJPEG_422_U8_P1020>(const uint8_t* d_d
 /**
  * Kernel - Copy raw image source data into three separated component buffers
  */
-typedef void (*gpujpeg_preprocessor_encode_kernel)(struct gpujpeg_preprocessor_data data, const uint8_t* d_data_raw, const uint8_t* d_data_raw_end, int image_width, int image_height, uint32_t width_div_mul, uint32_t width_div_shift);
+typedef void (*gpujpeg_preprocessor_encode_kernel)(struct gpujpeg_preprocessor_data data, const uint8_t* d_data_raw, int image_width_padding, int image_width, int image_height, uint32_t width_div_mul, uint32_t width_div_shift);
 
 /**
  * @note
@@ -171,7 +180,7 @@ template<
     uint8_t s_comp4_samp_factor_h, uint8_t s_comp4_samp_factor_v
 >
 __global__ void
-gpujpeg_preprocessor_raw_to_comp_kernel(struct gpujpeg_preprocessor_data data, const uint8_t* d_data_raw, const uint8_t* d_data_raw_end, int image_width, int image_height, uint32_t width_div_mul, uint32_t width_div_shift)
+gpujpeg_preprocessor_raw_to_comp_kernel(struct gpujpeg_preprocessor_data data, const uint8_t* d_data_raw, int image_width_padding, int image_width, int image_height, uint32_t width_div_mul, uint32_t width_div_shift)
 {
     int x  = threadIdx.x;
     int gX = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x;
@@ -187,7 +196,8 @@ gpujpeg_preprocessor_raw_to_comp_kernel(struct gpujpeg_preprocessor_data data, c
 
     // Load
     uchar4 r;
-    raw_to_comp_load<pixel_format>(d_data_raw, image_width, image_height, image_position, image_position_x, image_position_y, r);
+    int offset = image_position * unit_size<pixel_format>() + image_width_padding * image_position_y; // byte offset into d_data_raw: pixel index scaled by unit size, plus one row padding per preceding row; NOTE(review): computed in int, can overflow for very large images
+    raw_to_comp_load<pixel_format>(d_data_raw, image_width, image_height, offset, image_position_x, image_position_y, r);
 
     // Color transform
     gpujpeg_color_transform<color_space, color_space_internal>::perform(r);
@@ -399,7 +409,7 @@ gpujpeg_preprocessor_encode_interlaced(struct gpujpeg_encoder * encoder)
     kernel<<<grid, threads, 0, encoder->stream>>>(
         data,
         coder->d_data_raw,
-        coder->d_data_raw + coder->data_raw_size,
+        coder->param_image.width_padding,
         image_width,
         image_height,
         width_div_mul,
@@ -456,6 +466,8 @@ int
 gpujpeg_preprocessor_encode(struct gpujpeg_encoder * encoder)
 {
     struct gpujpeg_coder * coder = &encoder->coder;
+    /// @todo support padding for other formats (NOTE(review): assert() compiles out under NDEBUG, so release builds skip this check)
+    assert(!coder->param_image.width_padding || (coder->param_image.pixel_format == GPUJPEG_444_U8_P012 && coder->preprocessor));
     if (coder->preprocessor) {
             return gpujpeg_preprocessor_encode_interlaced(encoder);
     } else {
diff --git a/src/main.c b/src/main.c
index 8dc10a1c..a2820342 100644
--- a/src/main.c
+++ b/src/main.c
@@ -173,7 +173,7 @@ adjust_params(struct gpujpeg_parameters* param, struct gpujpeg_image_parameters*
               const char* out, bool encode, const struct options* opts)
 {
     // if possible, read properties from file
-    struct gpujpeg_image_parameters file_param_image = { 0, 0, GPUJPEG_NONE, GPUJPEG_PIXFMT_NONE };
+    struct gpujpeg_image_parameters file_param_image = { 0, 0, GPUJPEG_NONE, GPUJPEG_PIXFMT_NONE, 0 };
     const char *raw_file = encode ? in : out;
     gpujpeg_image_get_properties(raw_file, &file_param_image, encode);
     param_image->width = USE_IF_NOT_NULL_ELSE(param_image->width, file_param_image.width);