ARM64 NEON: handle 32-bit JDIMENSION/int arguments as 32-bit values

Purpose
  The C prototypes pass OUTPUT_COL, START_COL, OUTPUT_WIDTH, INPUT_ROW,
  OUTPUT_ROW and NUM_ROWS as 32-bit values, so the upper half of the 64-bit
  argument register must not be relied upon.

What the hunks do
  * jsimd_idct_islow_neon, jsimd_idct_ifast_neon, jsimd_idct_4x4_neon,
    jsimd_idct_2x2_neon: zero-extend w3 (OUTPUT_COL) into x3 on entry.
  * jsimd_convsamp_neon: zero-extend w1 (START_COL) into x1 on entry.
  * jsimd_ycc_\colorid\()_convert_neon[_slowst3]: OUTPUT_WIDTH, INPUT_ROW,
    NUM_ROWS and N become W-register aliases; the separate
    "lsl x16, INPUT_ROW, #3" is folded into the three row-pointer loads.
  * jsimd_\colorid\()_ycc_convert_neon[_slowld3]: OUTPUT_ROW and NUM_ROWS
    become W-register aliases.

Review notes
  * The folded loads use "uxtw #3", not "lsl #3": with a W index register the
    A64 register-offset addressing mode only accepts uxtw/sxtw, so
    "[Xn, Wm, lsl #3]" does not assemble. uxtw also zero-extends the
    unsigned row index, which is the intent of this patch.
  * NOTE(review): the uses of OUTPUT_ROW inside the rgb->ycc body are outside
    these hunks; confirm that any indexed load there also uses uxtw.
  * The post-image blob id on the "index" line was not recomputed after the
    uxtw correction, and intra-line whitespace was reconstructed; apply with
    whitespace-tolerant options if the context does not match exactly.

diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
index 74d6c76e8d..9ed5292ff9 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
@@ -210,6 +210,10 @@ asm_function jsimd_idct_islow_neon
     TMP7            .req x13
     TMP8            .req x14
 
+    /* OUTPUT_COL is a JDIMENSION that is an unsigned int: zero extend x3 to
+       avoid having random bits set in the upper part of this 64bit register. */
+    uxtw            x3, w3
+
     sub             sp, sp, #64
     adr             x15, Ljsimd_idct_islow_neon_consts
     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
@@ -807,6 +811,10 @@ asm_function jsimd_idct_ifast_neon
     TMP7            .req x13
     TMP8            .req x14
 
+    /* OUTPUT_COL is a JDIMENSION that is an unsigned int: zero extend x3 to
+       avoid having random bits set in the upper part of this 64bit register. */
+    uxtw            x3, w3
+
     /* Load and dequantize coefficients into NEON registers
      * with the following allocation:
      *       0 1 2 3 | 4 5 6 7
@@ -1101,6 +1109,10 @@ asm_function jsimd_idct_4x4_neon
     TMP3            .req x2
     TMP4            .req x15
 
+    /* OUTPUT_COL is a JDIMENSION that is an unsigned int: zero extend x3 to
+       avoid having random bits set in the upper part of this 64bit register. */
+    uxtw            x3, w3
+
     /* Save all used NEON registers */
     sub             sp, sp, 272
     str             x15, [sp], 16
@@ -1299,6 +1311,10 @@ asm_function jsimd_idct_2x2_neon
     TMP1            .req x0
     TMP2            .req x15
 
+    /* OUTPUT_COL is a JDIMENSION that is an unsigned int: zero extend x3 to
+       avoid having random bits set in the upper part of this 64bit register. */
+    uxtw            x3, w3
+
     /* vpush           {v8.4h - v15.4h}            ; not available */
     sub             sp, sp, 208
     str             x15, [sp], 16
@@ -1688,11 +1704,11 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
 .else
 asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
 .endif
-    OUTPUT_WIDTH    .req x0
+    OUTPUT_WIDTH    .req w0
     INPUT_BUF       .req x1
-    INPUT_ROW       .req x2
+    INPUT_ROW       .req w2
     OUTPUT_BUF      .req x3
-    NUM_ROWS        .req x4
+    NUM_ROWS        .req w4
 
     INPUT_BUF0      .req x5
     INPUT_BUF1      .req x6
@@ -1702,7 +1718,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
     Y               .req x8
     U               .req x9
     V               .req x10
-    N               .req x15
+    N               .req w15
 
     sub             sp, sp, 336
     str             x15, [sp], 16
@@ -1745,11 +1761,10 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
     cmp             NUM_ROWS, #1
     b.lt            9f
 0:
-    lsl             x16, INPUT_ROW, #3
-    ldr             Y, [INPUT_BUF0, x16]
-    ldr             U, [INPUT_BUF1, x16]
+    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
+    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
     mov             N, OUTPUT_WIDTH
-    ldr             V, [INPUT_BUF2, x16]
+    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
     add             INPUT_ROW, INPUT_ROW, #1
     ldr             RGB, [OUTPUT_BUF], #8
 
@@ -2054,8 +2069,8 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
     OUTPUT_WIDTH    .req w0
     INPUT_BUF       .req x1
     OUTPUT_BUF      .req x2
-    OUTPUT_ROW      .req x3
-    NUM_ROWS        .req x4
+    OUTPUT_ROW      .req w3
+    NUM_ROWS        .req w4
 
     OUTPUT_BUF0     .req x5
     OUTPUT_BUF1     .req x6
@@ -2199,6 +2214,10 @@ asm_function jsimd_convsamp_neon
     TMP8            .req x4
     TMPDUP          .req w3
 
+    /* Zero-extend the low 32bit word in x1 to follow the declaration of
+       the current function: START_COL is a JDIMENSION that is an unsigned int. */
+    uxtw            x1, w1
+
     mov             TMPDUP, #128
     ldp             TMP1, TMP2, [SAMPLE_DATA], 16
     ldp             TMP3, TMP4, [SAMPLE_DATA], 16