From b3760d83a4d4ea07a331f95a19e6d08295d42389 Mon Sep 17 00:00:00 2001
From: Matt Sarett <msarett@google.com>
Date: Wed, 13 Jul 2016 16:43:05 -0400
Subject: [PATCH] Cherry-pick of "Fix AArch64 ABI conformance issue in SIMD
 code"

In the AArch64 ABI, the high (unused) DWORD of a 32-bit argument's
register is undefined, so it was incorrect to use 64-bit
instructions to transfer a JDIMENSION argument in the 64-bit NEON SIMD
functions.  The code worked thus far only because the existing compiler
optimizers weren't smart enough to do anything else with the register in
question, so the upper 32 bits happened to be all zeroes.

The latest builds of Clang/LLVM have a smarter optimizer, and under
certain circumstances, it will attempt to load-combine adjacent 32-bit
integers from one of the libjpeg structures into a single 64-bit integer
and pass that 64-bit integer as a 32-bit argument to one of the SIMD
functions (which is allowed by the ABI, since the upper 32 bits of the
32-bit argument's register are undefined.)  This caused the
libjpeg-turbo regression tests to crash.

This patch tries to use the Wn registers whenever possible.  Otherwise,
it uses a zero-extend instruction to avoid using the upper 32 bits of
the 64-bit registers, which are not guaranteed to be valid for 32-bit
arguments.

Based on sebpop@1fbae13

Closes #91.  Refer also to android-ndk/ndk#110 and
https://llvm.org/bugs/show_bug.cgi?id=28393

BUG:31780857

Change-Id: Id80143ac13ba8d427196daf04f00be2214f85c86
---
 README.android          |  4 ++++
 simd/jsimd_arm64_neon.S | 36 ++++++++++++++++++++++++++++--------
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/README.android b/README.android
index cdbf200..1445551 100644
--- a/README.android
+++ b/README.android
@@ -32,3 +32,7 @@ It is unclear why the unmodified code from upstream appends an underscore
 to name.  Before removing the underscore, the code failed to link because
 the function names in the SIMD code did not match the callers (because of
 the extra underscore).
+
+(5) Cherry picked fix for Arm64 assembly from upstream.
+
+https://github.com/libjpeg-turbo/libjpeg-turbo/commit/1120ff29a178ee666504f0067e7c079a6b792296
diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
index 4be22e8..0a0e3c8 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
@@ -237,6 +237,11 @@ asm_function jsimd_idct_islow_neon
     TMP3            .req x2
     TMP4            .req x15
 
+    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x3 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x3, w3
+
     ROW0L           .req v16
     ROW0R           .req v17
     ROW1L           .req v18
@@ -794,6 +799,11 @@ asm_function jsimd_idct_ifast_neon
     TMP4            .req x22
     TMP5            .req x23
 
+    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x3 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x3, w3
+
     /* Load and dequantize coefficients into NEON registers
      * with the following allocation:
      *       0 1 2 3 | 4 5 6 7
@@ -1167,6 +1177,11 @@ asm_function jsimd_idct_4x4_neon
     TMP3            .req x2
     TMP4            .req x15
 
+    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x3 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x3, w3
+
     /* Save all used NEON registers */
     sub             sp, sp, 272
     str             x15, [sp], 16
@@ -1362,6 +1377,12 @@ asm_function jsimd_idct_2x2_neon
     TMP1            .req x0
     TMP2            .req x15
 
+   /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+       guarantee that the upper (unused) 32 bits of x3 are valid.  This
+       instruction ensures that those bits are set to zero. */
+    uxtw x3, w3
+
+
     /* vpush           {v8.4h - v15.4h}            ; not available */
     sub             sp, sp, 208
     str             x15, [sp], 16
@@ -1709,11 +1730,11 @@ Ljsimd_ycc_\colorid\()_neon_consts:
     .short          -128,  -128,   -128,   -128
 
 asm_function jsimd_ycc_\colorid\()_convert_neon
-    OUTPUT_WIDTH    .req x0
+    OUTPUT_WIDTH    .req w0
     INPUT_BUF       .req x1
-    INPUT_ROW       .req x2
+    INPUT_ROW       .req w2
     OUTPUT_BUF      .req x3
-    NUM_ROWS        .req x4
+    NUM_ROWS        .req w4
 
     INPUT_BUF0      .req x5
     INPUT_BUF1      .req x6
@@ -1723,7 +1744,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
     Y               .req x8
     U               .req x9
     V               .req x10
-    N               .req x15
+    N               .req w15
 
     sub             sp, sp, 336
     str             x15, [sp], 16
@@ -1760,11 +1781,10 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
     cmp             NUM_ROWS, #1
     b.lt            9f
 0:
-    lsl             x16, INPUT_ROW, #3
-    ldr             Y, [INPUT_BUF0, x16]
-    ldr             U, [INPUT_BUF1, x16]
+    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
+    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
     mov             N, OUTPUT_WIDTH
-    ldr             V, [INPUT_BUF2, x16]
+    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
     add             INPUT_ROW, INPUT_ROW, #1
     ldr             RGB, [OUTPUT_BUF], #8