Skip to content

Commit

Permalink
Crop - Tensor implementation on hip and host (#83)
Browse files Browse the repository at this point in the history
* Add crop for hip

* Add support for crop

* Add test suite for crop

* Fix #ifdefs

* Remove unused variable

* Remove unused variable

* Revert hip headers

* Remove unnecessary hip includes

* Add two versions of code - slower removal pending

* Remove usage of buffer_copy helpers
  • Loading branch information
r-abishek authored Dec 10, 2021
1 parent 5cfe0d4 commit dbf682b
Show file tree
Hide file tree
Showing 19 changed files with 1,884 additions and 26 deletions.
17 changes: 17 additions & 0 deletions include/rppt_tensor_augmentations.h
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,23 @@ RppStatus rppt_erode_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt

RppStatus rppt_dilate_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32u kernelSize, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);

/******************** crop ********************/

// Crop augmentation for a NCHW/NHWC layout tensor

// *param[in] srcPtr source tensor memory
// *param[in] srcDesc source tensor descriptor
// *param[out] dstPtr destination tensor memory
// *param[in] dstDesc destination tensor descriptor
// *param[in] roiTensorSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
// *param[in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
// *returns a RppStatus enumeration.
// *retval RPP_SUCCESS : succesful completion
// *retval RPP_ERROR : Error

RppStatus rppt_crop_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
RppStatus rppt_crop_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);

#ifdef __cplusplus
}
#endif
Expand Down
120 changes: 120 additions & 0 deletions src/include/cpu/rpp_cpu_simd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,66 @@ inline RppStatus rpp_store48_f32pln3_to_u8pkd3(Rpp8u *dstPtr, __m128 *p)
return RPP_SUCCESS;
}

inline RppStatus rpp_load48_u8pkd3_to_u8pln3(Rpp8u *srcPtr, __m128i *px)
{
__m128i pxSrc[8];
__m128i pxMask = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 12, 13, 14, 15);
__m128i pxMaskRGB = _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15);

pxSrc[0] = _mm_loadu_si128((__m128i *)srcPtr); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need RGB 01-04 */
pxSrc[1] = _mm_loadu_si128((__m128i *)(srcPtr + 12)); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need RGB 05-08 */
pxSrc[2] = _mm_loadu_si128((__m128i *)(srcPtr + 24)); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14] - Need RGB 09-12 */
pxSrc[3] = _mm_loadu_si128((__m128i *)(srcPtr + 36)); /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18] - Need RGB 13-16 */
pxSrc[0] = _mm_shuffle_epi8(pxSrc[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06] - Need R01-04, G01-04, B01-04 */
pxSrc[1] = _mm_shuffle_epi8(pxSrc[1], pxMask); /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10] - Need R05-08, G05-08, B05-08 */
pxSrc[2] = _mm_shuffle_epi8(pxSrc[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14] - Need R09-12, G09-12, B09-12 */
pxSrc[3] = _mm_shuffle_epi8(pxSrc[3], pxMask); /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18] - Need R13-16, G13-16, B13-16 */
pxSrc[4] = _mm_unpacklo_epi8(pxSrc[0], pxSrc[1]); /* unpack 8 lo-pixels of pxSrc[0] and pxSrc[1] */
pxSrc[5] = _mm_unpacklo_epi8(pxSrc[2], pxSrc[3]); /* unpack 8 lo-pixels of pxSrc[2] and pxSrc[3] */
pxSrc[6] = _mm_unpackhi_epi8(pxSrc[0], pxSrc[1]); /* unpack 8 hi-pixels of pxSrc[0] and pxSrc[1] */
pxSrc[7] = _mm_unpackhi_epi8(pxSrc[2], pxSrc[3]); /* unpack 8 hi-pixels of pxSrc[2] and pxSrc[3] */
px[0] = _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[4], pxSrc[5]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[4] and pxSrc[5] to get R01-16 */
px[1] = _mm_shuffle_epi8(_mm_unpackhi_epi8(pxSrc[4], pxSrc[5]), pxMaskRGB); /* unpack 8 hi-pixels of pxSrc[4] and pxSrc[5] to get G01-16 */
px[2] = _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[6], pxSrc[7]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 */

return RPP_SUCCESS;
}

inline RppStatus rpp_store48_u8pln3_to_u8pln3(Rpp8u *dstPtrR, Rpp8u *dstPtrG, Rpp8u *dstPtrB, __m128i *px)
{
_mm_storeu_si128((__m128i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
_mm_storeu_si128((__m128i *)dstPtrG, px[1]); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */
_mm_storeu_si128((__m128i *)dstPtrB, px[2]); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */

return RPP_SUCCESS;
}

inline RppStatus rpp_load48_u8pln3_to_u8pln3(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m128i *px)
{
px[0] = _mm_loadu_si128((__m128i *)srcPtrR); /* load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
px[1] = _mm_loadu_si128((__m128i *)srcPtrG); /* load [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */
px[2] = _mm_loadu_si128((__m128i *)srcPtrB); /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */

return RPP_SUCCESS;
}

inline RppStatus rpp_store48_u8pln3_to_u8pkd3(Rpp8u *dstPtr, __m128i *px)
{
__m128i pxDst[4];
__m128i pxZero = _mm_setzero_si128();
__m128i pxMaskRGBAtoRGB = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15);
pxDst[0] = _mm_unpacklo_epi8(px[1], pxZero);
pxDst[1] = _mm_unpackhi_epi8(px[1], pxZero);
pxDst[2] = _mm_unpacklo_epi8(px[0], px[2]);
pxDst[3] = _mm_unpackhi_epi8(px[0], px[2]);
_mm_storeu_si128((__m128i *)dstPtr, _mm_shuffle_epi8(_mm_unpacklo_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB)); /* store [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] */
_mm_storeu_si128((__m128i *)(dstPtr + 12), _mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB)); /* store [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */
_mm_storeu_si128((__m128i *)(dstPtr + 24), _mm_shuffle_epi8(_mm_unpacklo_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB)); /* store [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */
_mm_storeu_si128((__m128i *)(dstPtr + 36), _mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB)); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */

return RPP_SUCCESS;
}

inline RppStatus rpp_load16_u8_to_f32(Rpp8u *srcPtr, __m128 *p)
{
__m128i px[2];
Expand Down Expand Up @@ -502,6 +562,66 @@ inline RppStatus rpp_store48_f32pln3_to_i8pkd3(Rpp8s *dstPtr, __m128 *p)
return RPP_SUCCESS;
}

inline RppStatus rpp_load48_i8pkd3_to_i8pln3(Rpp8s *srcPtr, __m128i *px)
{
__m128i pxSrc[8];
__m128i pxMask = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 12, 13, 14, 15);
__m128i pxMaskRGB = _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15);

pxSrc[0] = _mm_loadu_si128((__m128i *)srcPtr); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need RGB 01-04 */
pxSrc[1] = _mm_loadu_si128((__m128i *)(srcPtr + 12)); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need RGB 05-08 */
pxSrc[2] = _mm_loadu_si128((__m128i *)(srcPtr + 24)); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14] - Need RGB 09-12 */
pxSrc[3] = _mm_loadu_si128((__m128i *)(srcPtr + 36)); /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18] - Need RGB 13-16 */
pxSrc[0] = _mm_shuffle_epi8(pxSrc[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06] - Need R01-04, G01-04, B01-04 */
pxSrc[1] = _mm_shuffle_epi8(pxSrc[1], pxMask); /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10] - Need R05-08, G05-08, B05-08 */
pxSrc[2] = _mm_shuffle_epi8(pxSrc[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14] - Need R09-12, G09-12, B09-12 */
pxSrc[3] = _mm_shuffle_epi8(pxSrc[3], pxMask); /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18] - Need R13-16, G13-16, B13-16 */
pxSrc[4] = _mm_unpacklo_epi8(pxSrc[0], pxSrc[1]); /* unpack 8 lo-pixels of pxSrc[0] and pxSrc[1] */
pxSrc[5] = _mm_unpacklo_epi8(pxSrc[2], pxSrc[3]); /* unpack 8 lo-pixels of pxSrc[2] and pxSrc[3] */
pxSrc[6] = _mm_unpackhi_epi8(pxSrc[0], pxSrc[1]); /* unpack 8 hi-pixels of pxSrc[0] and pxSrc[1] */
pxSrc[7] = _mm_unpackhi_epi8(pxSrc[2], pxSrc[3]); /* unpack 8 hi-pixels of pxSrc[2] and pxSrc[3] */
px[0] = _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[4], pxSrc[5]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[4] and pxSrc[5] to get R01-16 */
px[1] = _mm_shuffle_epi8(_mm_unpackhi_epi8(pxSrc[4], pxSrc[5]), pxMaskRGB); /* unpack 8 hi-pixels of pxSrc[4] and pxSrc[5] to get G01-16 */
px[2] = _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[6], pxSrc[7]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 */

return RPP_SUCCESS;
}

inline RppStatus rpp_store48_i8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m128i *px)
{
_mm_storeu_si128((__m128i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
_mm_storeu_si128((__m128i *)dstPtrG, px[1]); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */
_mm_storeu_si128((__m128i *)dstPtrB, px[2]); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */

return RPP_SUCCESS;
}

inline RppStatus rpp_load48_i8pln3_to_i8pln3(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m128i *px)
{
px[0] = _mm_loadu_si128((__m128i *)srcPtrR); /* load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
px[1] = _mm_loadu_si128((__m128i *)srcPtrG); /* load [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */
px[2] = _mm_loadu_si128((__m128i *)srcPtrB); /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */

return RPP_SUCCESS;
}

inline RppStatus rpp_store48_i8pln3_to_i8pkd3(Rpp8s *dstPtr, __m128i *px)
{
__m128i pxDst[4];
__m128i pxZero = _mm_setzero_si128();
__m128i pxMaskRGBAtoRGB = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15);
pxDst[0] = _mm_unpacklo_epi8(px[1], pxZero); /* unpack 8 lo-pixels of px[1] and pxZero */
pxDst[1] = _mm_unpackhi_epi8(px[1], pxZero); /* unpack 8 hi-pixels of px[1] and pxZero */
pxDst[2] = _mm_unpacklo_epi8(px[0], px[2]); /* unpack 8 lo-pixels of px[0] and px[2] */
pxDst[3] = _mm_unpackhi_epi8(px[0], px[2]); /* unpack 8 hi-pixels of px[0] and px[2] */
_mm_storeu_si128((__m128i *)dstPtr, _mm_shuffle_epi8(_mm_unpacklo_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB)); /* store [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] */
_mm_storeu_si128((__m128i *)(dstPtr + 12), _mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB)); /* store [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */
_mm_storeu_si128((__m128i *)(dstPtr + 24), _mm_shuffle_epi8(_mm_unpacklo_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB)); /* store [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */
_mm_storeu_si128((__m128i *)(dstPtr + 36), _mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB)); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */

return RPP_SUCCESS;
}

inline RppStatus rpp_load16_i8_to_f32(Rpp8s *srcPtr, __m128 *p)
{
__m128i px[2];
Expand Down
69 changes: 45 additions & 24 deletions src/include/hip/rpp_hip_common.hpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
#ifndef RPP_HIP_COMMON_H
#define RPP_HIP_COMMON_H

#include <hip/hip_runtime_api.h>
#include "hip/rpp/handle.hpp"
#include <hip/hip_runtime.h>
#include <hip/hip_ext.h>
#include <hip/hip_fp16.h>
#include <rppdefs.h>
#include <vector>
Expand Down Expand Up @@ -59,6 +58,7 @@ typedef struct d_half24
} d_half24;

// uchar
typedef unsigned char uchar;
typedef struct d_uchar8
{
uchar4 x;
Expand All @@ -79,6 +79,27 @@ enum class RPPTensorDataType
I8,
};

// schar
typedef signed char schar;
typedef struct d_schar4
{
schar x;
schar y;
schar z;
schar w;
} d_schar4;
typedef struct d_schar8
{
d_schar4 x;
d_schar4 y;
} d_schar8;
typedef struct d_schar24
{
d_schar8 x;
d_schar8 y;
d_schar8 z;
} d_schar24;

struct RPPTensorFunctionMetaData
{
RPPTensorDataType _in_type = RPPTensorDataType::U8;
Expand Down Expand Up @@ -170,7 +191,7 @@ __device__ __forceinline__ void rpp_hip_adjust_range(float *dstPtr, d_float8 *su
sum_f8->y = sum_f8->y * (float4) 0.00392157;
}

__device__ __forceinline__ void rpp_hip_adjust_range(signed char *dstPtr, d_float8 *sum_f8)
__device__ __forceinline__ void rpp_hip_adjust_range(schar *dstPtr, d_float8 *sum_f8)
{
sum_f8->x = sum_f8->x - (float4) 128;
sum_f8->y = sum_f8->y - (float4) 128;
Expand Down Expand Up @@ -198,7 +219,7 @@ __device__ __forceinline__ void rpp_hip_adjust_range(float *dstPtr, d_float24 *s
sum_f24->z.y = sum_f24->z.y * (float4) 0.00392157;
}

__device__ __forceinline__ void rpp_hip_adjust_range(signed char *dstPtr, d_float24 *sum_f24)
__device__ __forceinline__ void rpp_hip_adjust_range(schar *dstPtr, d_float24 *sum_f24)
{
sum_f24->x.x = sum_f24->x.x - (float4) 128;
sum_f24->x.y = sum_f24->x.y - (float4) 128;
Expand Down Expand Up @@ -235,10 +256,10 @@ __device__ __forceinline__ uint rpp_hip_pack(float4 src)
__device__ __forceinline__ uint rpp_hip_pack_i8(float4 src)
{
char4 dst_c4;
dst_c4.w = (signed char)(src.w);
dst_c4.z = (signed char)(src.z);
dst_c4.y = (signed char)(src.y);
dst_c4.x = (signed char)(src.x);
dst_c4.w = (schar)(src.w);
dst_c4.z = (schar)(src.z);
dst_c4.y = (schar)(src.y);
dst_c4.x = (schar)(src.x);

return *(uint *)&dst_c4;
}
Expand Down Expand Up @@ -276,22 +297,22 @@ __device__ __forceinline__ float4 rpp_hip_unpack(uint src)

__device__ __forceinline__ float rpp_hip_unpack0(int src)
{
return (float)(signed char)(src & 0xFF);
return (float)(schar)(src & 0xFF);
}

__device__ __forceinline__ float rpp_hip_unpack1(int src)
{
return (float)(signed char)((src >> 8) & 0xFF);
return (float)(schar)((src >> 8) & 0xFF);
}

__device__ __forceinline__ float rpp_hip_unpack2(int src)
{
return (float)(signed char)((src >> 16) & 0xFF);
return (float)(schar)((src >> 16) & 0xFF);
}

__device__ __forceinline__ float rpp_hip_unpack3(int src)
{
return (float)(signed char)((src >> 24) & 0xFF);
return (float)(schar)((src >> 24) & 0xFF);
}

__device__ __forceinline__ float4 rpp_hip_unpack_from_i8(int src)
Expand All @@ -303,7 +324,7 @@ __device__ __forceinline__ float4 rpp_hip_unpack_from_i8(int src)

// I8 to U8 conversions (8 pixels)

__device__ __forceinline__ void rpp_hip_convert8_i8_to_u8(signed char *srcPtr, uchar *dstPtr)
__device__ __forceinline__ void rpp_hip_convert8_i8_to_u8(schar *srcPtr, uchar *dstPtr)
{
int2 *srcPtr_8;
srcPtr_8 = (int2 *)srcPtr;
Expand All @@ -317,7 +338,7 @@ __device__ __forceinline__ void rpp_hip_convert8_i8_to_u8(signed char *srcPtr, u

// I8 to U8 conversions (24 pixels)

__device__ __forceinline__ void rpp_hip_convert24_i8_to_u8(signed char *srcPtr, uchar *dstPtr)
__device__ __forceinline__ void rpp_hip_convert24_i8_to_u8(schar *srcPtr, uchar *dstPtr)
{
d_int6 *srcPtr_24;
srcPtr_24 = (d_int6 *)srcPtr;
Expand Down Expand Up @@ -355,7 +376,7 @@ __device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(float *srcPtr

// I8 loads without layout toggle (8 I8 pixels)

__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(signed char *srcPtr, int srcIdx, d_float8 *src_f8)
__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(schar *srcPtr, int srcIdx, d_float8 *src_f8)
{
int2 src = *((int2 *)(&srcPtr[srcIdx]));
src_f8->x = rpp_hip_unpack_from_i8(src.x);
Expand Down Expand Up @@ -416,7 +437,7 @@ __device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pln3(f

// I8 loads without layout toggle PLN3 to PLN3 (24 I8 pixels)

__device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pln3(signed char *srcPtr, int srcIdx, uint increment, d_float24 *src_f24)
__device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pln3(schar *srcPtr, int srcIdx, uint increment, d_float24 *src_f24)
{
d_int6 src;

Expand Down Expand Up @@ -531,7 +552,7 @@ __device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(f

// I8 loads with layout toggle PKD3 to PLN3 (24 I8 pixels)

__device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(signed char *srcPtr, int srcIdx, d_float24 *src_f24)
__device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(schar *srcPtr, int srcIdx, d_float24 *src_f24)
{
d_int6 src = *((d_int6 *)(&srcPtr[srcIdx]));

Expand Down Expand Up @@ -650,7 +671,7 @@ __device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(f

// I8 loads with layout toggle PLN3 to PKD3 (24 I8 pixels)

__device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(signed char *srcPtr, int srcIdx, uint increment, d_float24 *src_f24)
__device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(schar *srcPtr, int srcIdx, uint increment, d_float24 *src_f24)
{
d_int6 src;

Expand Down Expand Up @@ -738,7 +759,7 @@ __device__ __forceinline__ void rpp_hip_pack_float8_and_store8(float *dstPtr, ui

// I8 stores without layout toggle (8 I8 pixels)

__device__ __forceinline__ void rpp_hip_pack_float8_and_store8(signed char *dstPtr, uint dstIdx, d_float8 *dst_f8)
__device__ __forceinline__ void rpp_hip_pack_float8_and_store8(schar *dstPtr, uint dstIdx, d_float8 *dst_f8)
{
uint2 dst;
dst.x = rpp_hip_pack_i8(dst_f8->x);
Expand Down Expand Up @@ -785,7 +806,7 @@ __device__ __forceinline__ void rpp_hip_pack_float24_pkd3_and_store24_pkd3(float

// I8 stores without layout toggle PKD3 to PKD3 (24 I8 pixels)

__device__ __forceinline__ void rpp_hip_pack_float24_pkd3_and_store24_pkd3(signed char *dstPtr, uint dstIdx, d_float24 *dst_f24)
__device__ __forceinline__ void rpp_hip_pack_float24_pkd3_and_store24_pkd3(schar *dstPtr, uint dstIdx, d_float24 *dst_f24)
{
d_uint6 dst;

Expand Down Expand Up @@ -856,7 +877,7 @@ __device__ __forceinline__ void rpp_hip_pack_float24_pln3_and_store24_pln3(float

// I8 stores without layout toggle PLN3 to PLN3 (24 I8 pixels)

__device__ __forceinline__ void rpp_hip_pack_float24_pln3_and_store24_pln3(signed char *dstPtr, uint dstIdx, uint increment, d_float24 *dst_f24)
__device__ __forceinline__ void rpp_hip_pack_float24_pln3_and_store24_pln3(schar *dstPtr, uint dstIdx, uint increment, d_float24 *dst_f24)
{
d_uint6 dst;

Expand Down Expand Up @@ -962,7 +983,7 @@ __device__ __forceinline__ void rpp_hip_pack_float24_pln3_and_store24_pkd3(float

// I8 stores with layout toggle PLN3 to PKD3 (24 I8 pixels)

__device__ __forceinline__ void rpp_hip_pack_float24_pln3_and_store24_pkd3(signed char *dstPtr, uint dstIdx, d_float24 *dst_f24)
__device__ __forceinline__ void rpp_hip_pack_float24_pln3_and_store24_pkd3(schar *dstPtr, uint dstIdx, d_float24 *dst_f24)
{
d_uint6 dst;

Expand Down Expand Up @@ -1030,7 +1051,7 @@ __device__ __forceinline__ void rpp_hip_load8_to_uchar8(float *srcPtr, int srcId

// I8 loads without layout toggle (8 I8 pixels)

__device__ __forceinline__ void rpp_hip_load8_to_uchar8(signed char *srcPtr, int srcIdx, uchar *src_uchar8)
__device__ __forceinline__ void rpp_hip_load8_to_uchar8(schar *srcPtr, int srcIdx, uchar *src_uchar8)
{
rpp_hip_convert8_i8_to_u8(&srcPtr[srcIdx], src_uchar8);
}
Expand Down Expand Up @@ -1171,7 +1192,7 @@ __device__ __forceinline__ void rpp_hip_load24_pkd3_to_uchar8_pln3(half *srcPtr,

// I8 loads with layout toggle PKD3 to PLN3 (24 I8 pixels)

__device__ __forceinline__ void rpp_hip_load24_pkd3_to_uchar8_pln3(signed char *srcPtr, int srcIdx, uchar **srcPtrs_uchar8)
__device__ __forceinline__ void rpp_hip_load24_pkd3_to_uchar8_pln3(schar *srcPtr, int srcIdx, uchar **srcPtrs_uchar8)
{
d_uchar24 src_uchar24;
rpp_hip_convert24_i8_to_u8(&srcPtr[srcIdx], (uchar *)&src_uchar24);
Expand Down
Loading

0 comments on commit dbf682b

Please sign in to comment.