From bdbe869e9981143d9e288118b0b64476aaf1e0c4 Mon Sep 17 00:00:00 2001
From: Martin Pulec <martin.pulec@cesnet.cz>
Date: Thu, 15 Feb 2024 13:22:00 +0100
Subject: [PATCH] workaround for possibly unsupported instruction by ZLUDA

In theory this can have an impact on performance, but there was no
measurable difference after this change. This is perhaps because the
bottleneck is the transfers from/to global memory, so the computation
time is masked by those transfers.

closes GH-90
---
 src/gpujpeg_dct_gpu.cu | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/gpujpeg_dct_gpu.cu b/src/gpujpeg_dct_gpu.cu
index 8592c512..6635fc9a 100644
--- a/src/gpujpeg_dct_gpu.cu
+++ b/src/gpujpeg_dct_gpu.cu
@@ -602,8 +602,14 @@ gpujpeg_idct_gpu_kernel(int16_t* source, uint8_t* result, int output_stride, uin
 		//cast float to uint8_t with saturation (.sat) which cuts values higher than 
 		//255 to 255 and smaller than 0 to 0; cuda can't use a reg smaller than 32b 
 		//(though it can convert to 8b for the saturation purposes and save to 32b reg)
-		uint32_t save;
-		asm("cvt.rni.u8.f32.sat	%0, %1;" : "=r"(save) : "f"(x[i] + ((float) 128.0)));
+		// uint32_t save;
+		// asm("cvt.rni.u8.f32.sat	%0, %1;" : "=r"(save) : "f"(x[i] + ((float) 128.0)));
+
+		// The following workaround enables GPUJPEG with ZLUDA (see GH-90). It may be
+		// slower, but not measurably so, perhaps because the computation time is
+		// masked by global memory transfers.
+		int save = rintf(x[i] + 128.0F);
+		save = save < 0 ? 0 : save > 255 ? 255 : save;
 		((uint8_t*) tempResultP)[i] = save;
 	}