Skip to content

Commit

Permalink
Fix AArch64 ABI conformance issue in SIMD code.
Browse files Browse the repository at this point in the history
In the AArch64 ABI, the high (unused) DWORD of a 32-bit argument's
register is undefined, so it was incorrect to use 64-bit instructions
to transfer a JDIMENSION argument in the 64-bit NEON SIMD functions.
The code worked thus far only because the existing compiler optimizers
weren't smart enough to do anything else with the register in question,
so the upper 32 bits happened to be all zeroes.

The latest builds of Clang/LLVM have a smarter optimizer, and under
certain circumstances, it will attempt to load-combine adjacent 32-bit
integers from one of the libjpeg structures into a single 64-bit integer
and pass that 64-bit integer as a 32-bit argument to one of the SIMD
functions (which is allowed by the ABI, since the upper 32 bits of the
32-bit argument's register are undefined.)  This caused the
libjpeg-turbo regression tests to crash.

This patch tries to use the Wn registers whenever possible.  Otherwise,
it uses a zero-extend instruction to avoid using the upper 32 bits of
the 64-bit registers, which are not guaranteed to be valid for 32-bit
arguments.

Based on sebpop@1fbae13

Closes #91.  Refer also to android/ndk#110 and
https://llvm.org/bugs/show_bug.cgi?id=28393

(cherrypick from external/libjpeg-turbo of c8282be605a139b269f2b8f4b4fb6f4118e740ab.)

Change-Id: I935a3696106e7ffcf9b1a6c12cb7f31d95b4ccc8
  • Loading branch information
enh-google authored and Inventor1938 committed Sep 20, 2016
1 parent e295fa4 commit 42f1d7b
Showing 1 changed file with 49 additions and 15 deletions.
64 changes: 49 additions & 15 deletions jsimd_arm64_neon.S
Original file line number Diff line number Diff line change
Expand Up @@ -203,14 +203,33 @@ asm_function jsimd_idct_islow_neon
OUTPUT_COL .req x3
TMP1 .req x0
TMP2 .req x1
TMP3 .req x9
TMP4 .req x10
TMP5 .req x11
TMP6 .req x12
TMP7 .req x13
TMP8 .req x14
TMP3 .req x2
TMP4 .req x15

sub sp, sp, #64
/* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
guarantee that the upper (unused) 32 bits of x3 are valid. This
instruction ensures that those bits are set to zero. */
uxtw x3, w3

ROW0L .req v16
ROW0R .req v17
ROW1L .req v18
ROW1R .req v19
ROW2L .req v20
ROW2R .req v21
ROW3L .req v22
ROW3R .req v23
ROW4L .req v24
ROW4R .req v25
ROW5L .req v26
ROW5R .req v27
ROW6L .req v28
ROW6R .req v29
ROW7L .req v30
ROW7R .req v31
/* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
sub sp, sp, 272
str x15, [sp], 16
adr x15, Ljsimd_idct_islow_neon_consts
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
Expand Down Expand Up @@ -807,6 +826,11 @@ asm_function jsimd_idct_ifast_neon
TMP7 .req x13
TMP8 .req x14

/* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
guarantee that the upper (unused) 32 bits of x3 are valid. This
instruction ensures that those bits are set to zero. */
uxtw x3, w3

/* Load and dequantize coefficients into NEON registers
* with the following allocation:
* 0 1 2 3 | 4 5 6 7
Expand Down Expand Up @@ -1101,6 +1125,11 @@ asm_function jsimd_idct_4x4_neon
TMP3 .req x2
TMP4 .req x15

/* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
guarantee that the upper (unused) 32 bits of x3 are valid. This
instruction ensures that those bits are set to zero. */
uxtw x3, w3

/* Save all used NEON registers */
sub sp, sp, 272
str x15, [sp], 16
Expand Down Expand Up @@ -1299,6 +1328,12 @@ asm_function jsimd_idct_2x2_neon
TMP1 .req x0
TMP2 .req x15

/* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
guarantee that the upper (unused) 32 bits of x3 are valid. This
instruction ensures that those bits are set to zero. */
uxtw x3, w3


/* vpush {v8.4h - v15.4h} ; not available */
sub sp, sp, 208
str x15, [sp], 16
Expand Down Expand Up @@ -1684,11 +1719,11 @@ Ljsimd_ycc_\colorid\()_neon_consts:
.short -128, -128, -128, -128

asm_function jsimd_ycc_\colorid\()_convert_neon
OUTPUT_WIDTH .req x0
OUTPUT_WIDTH .req w0
INPUT_BUF .req x1
INPUT_ROW .req x2
INPUT_ROW .req w2
OUTPUT_BUF .req x3
NUM_ROWS .req x4
NUM_ROWS .req w4

INPUT_BUF0 .req x5
INPUT_BUF1 .req x6
Expand All @@ -1698,7 +1733,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
Y .req x8
U .req x9
V .req x10
N .req x15
N .req w15

sub sp, sp, 336
str x15, [sp], 16
Expand Down Expand Up @@ -1737,11 +1772,10 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
cmp NUM_ROWS, #1
b.lt 9f
0:
lsl x16, INPUT_ROW, #3
ldr Y, [INPUT_BUF0, x16]
ldr U, [INPUT_BUF1, x16]
ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
mov N, OUTPUT_WIDTH
ldr V, [INPUT_BUF2, x16]
ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
add INPUT_ROW, INPUT_ROW, #1
ldr RGB, [OUTPUT_BUF], #8

Expand Down

0 comments on commit 42f1d7b

Please sign in to comment.