Skip to content

Commit

Permalink
fix crash when compiling with Clang for AArch64
Browse files Browse the repository at this point in the history
This patch fixes android/ndk#110.
The crash appears to have been triggered by a change in LLVM and was reported
to the LLVM Bugzilla: https://llvm.org/bugs/show_bug.cgi?id=28393

Investigation showed that LLVM had optimized away the zeroing of the upper bits
of the x2 register before calling color_convert.  The convention is for the
callee to do any truncation needed.  The patch therefore uses the Wn registers
wherever possible, and otherwise uses a zero-extend instruction (uxtw), so that
the random contents of the upper 32 bits of the 64-bit registers are never used.
  • Loading branch information
Sebastian Pop committed Jul 12, 2016
1 parent 6e9d43e commit 1fbae13
Showing 1 changed file with 32 additions and 13 deletions.
45 changes: 32 additions & 13 deletions simd/jsimd_arm64_neon.S
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,10 @@ asm_function jsimd_idct_islow_neon
TMP7 .req x13
TMP8 .req x14

/* OUTPUT_COL is a JDIMENSION that is an unsigned int: zero extend x3 to
avoid having random bits set in the upper part of this 64bit register. */
uxtw x3, w3

sub sp, sp, #64
adr x15, Ljsimd_idct_islow_neon_consts
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
Expand Down Expand Up @@ -807,6 +811,10 @@ asm_function jsimd_idct_ifast_neon
TMP7 .req x13
TMP8 .req x14

/* OUTPUT_COL is a JDIMENSION that is an unsigned int: zero extend x3 to
avoid having random bits set in the upper part of this 64bit register. */
uxtw x3, w3

/* Load and dequantize coefficients into NEON registers
* with the following allocation:
* 0 1 2 3 | 4 5 6 7
Expand Down Expand Up @@ -1101,6 +1109,10 @@ asm_function jsimd_idct_4x4_neon
TMP3 .req x2
TMP4 .req x15

/* OUTPUT_COL is a JDIMENSION that is an unsigned int: zero extend x3 to
avoid having random bits set in the upper part of this 64bit register. */
uxtw x3, w3

/* Save all used NEON registers */
sub sp, sp, 272
str x15, [sp], 16
Expand Down Expand Up @@ -1299,6 +1311,10 @@ asm_function jsimd_idct_2x2_neon
TMP1 .req x0
TMP2 .req x15

/* OUTPUT_COL is a JDIMENSION that is an unsigned int: zero extend x3 to
avoid having random bits set in the upper part of this 64bit register. */
uxtw x3, w3

/* vpush {v8.4h - v15.4h} ; not available */
sub sp, sp, 208
str x15, [sp], 16
Expand Down Expand Up @@ -1688,11 +1704,11 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
.else
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif
OUTPUT_WIDTH .req x0
OUTPUT_WIDTH .req w0
INPUT_BUF .req x1
INPUT_ROW .req x2
INPUT_ROW .req w2
OUTPUT_BUF .req x3
NUM_ROWS .req x4
NUM_ROWS .req w4

INPUT_BUF0 .req x5
INPUT_BUF1 .req x6
Expand All @@ -1702,7 +1718,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
Y .req x8
U .req x9
V .req x10
N .req x15
N .req w15

sub sp, sp, 336
str x15, [sp], 16
Expand Down Expand Up @@ -1745,11 +1761,10 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
cmp NUM_ROWS, #1
b.lt 9f
0:
lsl x16, INPUT_ROW, #3
ldr Y, [INPUT_BUF0, x16]
ldr U, [INPUT_BUF1, x16]
ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
mov N, OUTPUT_WIDTH
ldr V, [INPUT_BUF2, x16]
ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
add INPUT_ROW, INPUT_ROW, #1
ldr RGB, [OUTPUT_BUF], #8

Expand Down Expand Up @@ -2054,8 +2069,8 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
OUTPUT_WIDTH .req w0
INPUT_BUF .req x1
OUTPUT_BUF .req x2
OUTPUT_ROW .req x3
NUM_ROWS .req x4
OUTPUT_ROW .req w3
NUM_ROWS .req w4

OUTPUT_BUF0 .req x5
OUTPUT_BUF1 .req x6
Expand Down Expand Up @@ -2089,10 +2104,10 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
cmp NUM_ROWS, #1
b.lt 9f
0:
ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #3]
ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #3]
ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
mov N, OUTPUT_WIDTH
ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #3]
ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
add OUTPUT_ROW, OUTPUT_ROW, #1
ldr RGB, [INPUT_BUF], #8

Expand Down Expand Up @@ -2199,6 +2214,10 @@ asm_function jsimd_convsamp_neon
TMP8 .req x4
TMPDUP .req w3

/* Zero-extend the low 32bit word in x1 to follow the declaration of
the current function: START_COL is a JDIMENSION that is an unsigned int. */
uxtw x1, w1

mov TMPDUP, #128
ldp TMP1, TMP2, [SAMPLE_DATA], 16
ldp TMP3, TMP4, [SAMPLE_DATA], 16
Expand Down

0 comments on commit 1fbae13

Please sign in to comment.