diff --git a/src/gpujpeg_dct_gpu.cu b/src/gpujpeg_dct_gpu.cu
index 8592c512..6635fc9a 100644
--- a/src/gpujpeg_dct_gpu.cu
+++ b/src/gpujpeg_dct_gpu.cu
@@ -602,8 +602,14 @@ gpujpeg_idct_gpu_kernel(int16_t* source, uint8_t* result, int output_stride, uin
 		//cast float to uint8_t with saturation (.sat) which cuts values higher than 
 		//255 to 255 and smaller than 0 to 0; cuda can't use a reg smaller than 32b 
 		//(though it can convert to 8b for the saturation purposes and save to 32b reg)
-		uint32_t save;
-		asm("cvt.rni.u8.f32.sat	%0, %1;" : "=r"(save) : "f"(x[i] + ((float) 128.0)));
+		// uint32_t save;
+		// asm("cvt.rni.u8.f32.sat	%0, %1;" : "=r"(save) : "f"(x[i] + ((float) 128.0)));
+
+		// The following workaround enables GPUJPEG with ZLUDA (see GH-90). It may be
+		// slower, but the difference is not measurable, perhaps because the computation
+		// time is masked by global memory transfers.
+		int save = rintf(x[i] + 128.0F);
+		save = save < 0 ? 0 : save > 255 ? 255 : save;
 		((uint8_t*) tempResultP)[i] = save;
 	}