
Commit

Merge remote-tracking branch 'upstream/master'
Sibras committed Oct 1, 2021
2 parents 4ca702d + 66a5bc1 commit 77bee0c
Showing 31 changed files with 118 additions and 81 deletions.
2 changes: 1 addition & 1 deletion AUTHORS
@@ -58,7 +58,7 @@ S: Sweden
N: Laurent Aimar
E: fenrir AT videolan DOT org
C: fenrir
D: Intial import, former maintainer
D: Initial import, former maintainer
D: x86 asm (mmx/mmx2)
S: France

4 changes: 2 additions & 2 deletions common/aarch64/mc-c.c
@@ -210,7 +210,7 @@ static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride,
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
if( (mvy&3) == 3 ) // explict if() to force conditional add
if( (mvy&3) == 3 ) // explicit if() to force conditional add
src1 += i_src_stride;

if( qpel_idx & 5 ) /* qpel interpolation needed */
@@ -236,7 +236,7 @@ static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
if( (mvy&3) == 3 ) // explict if() to force conditional add
if( (mvy&3) == 3 ) // explicit if() to force conditional add
src1 += i_src_stride;

if( qpel_idx & 5 ) /* qpel interpolation needed */
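
The mc_luma_neon/get_ref_neon hunks above (and the identical ARM versions below) only correct a comment, but the surrounding logic is easy to check by hand: the quarter-pel motion vector splits into a fractional index qpel_idx and an integer-pel offset, and the explicit if() is there, per the comment, to get a conditional add. A standalone worked example with hypothetical mvx/mvy and stride values:

    #include <stdio.h>

    int main(void)
    {
        int mvx = 5, mvy = 7;     /* hypothetical quarter-pel motion vector */
        long i_src_stride = 64;   /* hypothetical luma stride */
        int qpel_idx = ((mvy&3)<<2) + (mvx&3);           /* fractional part, 0..15 */
        long offset = (mvy>>2)*i_src_stride + (mvx>>2);  /* integer-pel offset */
        printf( "qpel_idx=%d offset=%ld need_qpel=%d add_stride=%d\n",
                qpel_idx, offset, !!(qpel_idx & 5), (mvy&3) == 3 );
        return 0;
    }
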
4 changes: 2 additions & 2 deletions common/arm/mc-c.c
@@ -218,7 +218,7 @@ static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride,
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
if( (mvy&3) == 3 ) // explict if() to force conditional add
if( (mvy&3) == 3 ) // explicit if() to force conditional add
src1 += i_src_stride;

if( qpel_idx & 5 ) /* qpel interpolation needed */
@@ -244,7 +244,7 @@ static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
if( (mvy&3) == 3 ) // explict if() to force conditional add
if( (mvy&3) == 3 ) // explicit if() to force conditional add
src1 += i_src_stride;

if( qpel_idx & 5 ) /* qpel interpolation needed */
8 changes: 4 additions & 4 deletions common/bitstream.c
@@ -92,10 +92,10 @@ void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal )
{
/* Size doesn't include the size of the header we're writing now. */
int chunk_size = size - 4;
orig_dst[0] = chunk_size >> 24;
orig_dst[1] = chunk_size >> 16;
orig_dst[2] = chunk_size >> 8;
orig_dst[3] = chunk_size >> 0;
orig_dst[0] = (uint8_t)(chunk_size >> 24);
orig_dst[1] = (uint8_t)(chunk_size >> 16);
orig_dst[2] = (uint8_t)(chunk_size >> 8);
orig_dst[3] = (uint8_t)(chunk_size >> 0);
}

nal->i_payload = size;
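
For context on the new casts: x264_nal_encode is storing the chunk size as a 4-byte big-endian prefix, and the (uint8_t) casts only make the narrowing of each shifted value explicit. A minimal standalone sketch of the same byte-order write (the helper name is hypothetical, not x264's API):

    #include <stdint.h>
    #include <stdio.h>

    /* Store a 32-bit chunk size as four big-endian bytes, mirroring the hunk above. */
    static void write_be32( uint8_t *dst, int chunk_size )
    {
        dst[0] = (uint8_t)(chunk_size >> 24);
        dst[1] = (uint8_t)(chunk_size >> 16);
        dst[2] = (uint8_t)(chunk_size >> 8);
        dst[3] = (uint8_t)(chunk_size >> 0);
    }

    int main(void)
    {
        uint8_t prefix[4];
        write_be32( prefix, 123456 );
        printf( "%02X %02X %02X %02X\n", prefix[0], prefix[1], prefix[2], prefix[3] );
        return 0;
    }
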
4 changes: 2 additions & 2 deletions common/cabac.c
@@ -89,10 +89,10 @@ static inline void cabac_putbyte( x264_cabac_t *cb )
cb->p[-1] += carry;
while( bytes_outstanding > 0 )
{
*(cb->p++) = carry-1;
*(cb->p++) = (uint8_t)(carry-1);
bytes_outstanding--;
}
*(cb->p++) = out;
*(cb->p++) = (uint8_t)out;
cb->i_bytes_outstanding = 0;
}
}
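
The two casts above sit in cabac_putbyte's carry handling: when a carry ripples out, the previously written byte is incremented and every byte held back as "outstanding" flips from 0xFF to 0x00, which is exactly what (uint8_t)(carry-1) produces. A tiny standalone check, illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* carry == 0 keeps outstanding bytes at 0xFF; carry == 1 turns them into 0x00 */
        for( int carry = 0; carry <= 1; carry++ )
            printf( "carry=%d -> outstanding byte 0x%02X\n", carry, (uint8_t)(carry-1) );
        return 0;
    }
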
4 changes: 2 additions & 2 deletions common/deblock.c
@@ -655,11 +655,11 @@ void x264_macroblock_deblock( x264_t *h )
} while( 0 )

if( !transform_8x8 ) FILTER( 0, 1 );
FILTER( 0, 2 );
FILTER( 0, 2 );
if( !transform_8x8 ) FILTER( 0, 3 );

if( !transform_8x8 ) FILTER( 1, 1 );
FILTER( 1, 2 );
FILTER( 1, 2 );
if( !transform_8x8 ) FILTER( 1, 3 );

#undef FILTER
2 changes: 1 addition & 1 deletion common/macroblock.h
@@ -361,7 +361,7 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed );
* uses all neighbors, even those that didn't end up using this ref.
* h->mb. need only valid values from other blocks */
#define x264_mb_predict_mv_ref16x16 x264_template(mb_predict_mv_ref16x16)
void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[8][2], int *i_mvc );
void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t (*mvc)[2], int *i_mvc );

#define x264_mb_mc x264_template(mb_mc)
void x264_mb_mc( x264_t *h );
4 changes: 2 additions & 2 deletions common/mc.h
@@ -61,8 +61,8 @@ static void mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t
if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\
continue;\
\
unsigned mbx = current[0];\
unsigned mby = current[1];\
unsigned mbx = (unsigned)current[0];\
unsigned mby = (unsigned)current[1];\
unsigned idx0 = mbx + mby * stride;\
unsigned idx2 = idx0 + stride;\
\
8 changes: 4 additions & 4 deletions common/mvpred.c
@@ -171,8 +171,8 @@ void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];

if( i_refa == -2 || i_refb == -2 ||
!( i_refa | M32( mv_a ) ) ||
!( i_refb | M32( mv_b ) ) )
!( (uint32_t)i_refa | M32( mv_a ) ) ||
!( (uint32_t)i_refb | M32( mv_b ) ) )
{
M32( mv ) = 0;
}
@@ -304,7 +304,7 @@ static ALWAYS_INLINE int mb_predict_mv_direct16x16_spatial( x264_t *h, int b_int
mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
}

int i_ref = X264_MIN3( (unsigned)i_refa, (unsigned)i_refb, (unsigned)i_refc );
int i_ref = (int)X264_MIN3( (unsigned)i_refa, (unsigned)i_refb, (unsigned)i_refc );
if( i_ref < 0 )
{
i_ref = -1;
@@ -516,7 +516,7 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
}

/* This just improves encoder performance, it's not part of the spec */
void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[9][2], int *i_mvc )
void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t (*mvc)[2], int *i_mvc )
{
int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref];
int i = 0;
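
The new (uint32_t) casts in the P-Skip hunk above do not change the test; they likely just remove a signed/unsigned mismatch against M32()'s unsigned load. The underlying idiom, OR-ing the reference index with the packed 32-bit motion vector so the result is zero only when ref == 0 and mv == (0,0), can be sketched standalone like this (simplified names, not x264's internals):

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* Simplified stand-in for x264's M32(): load both int16_t MV components as one word. */
    static uint32_t m32( const int16_t mv[2] )
    {
        uint32_t v;
        memcpy( &v, mv, sizeof(v) );
        return v;
    }

    int main(void)
    {
        int i_refa = 0;
        int16_t mv_a[2] = { 0, 0 };
        if( !( (uint32_t)i_refa | m32( mv_a ) ) )
            printf( "neighbour has ref 0 and zero MV -> P-Skip predictor forced to (0,0)\n" );
        return 0;
    }
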
2 changes: 1 addition & 1 deletion common/osdep.h
@@ -289,7 +289,7 @@ static inline int x264_is_regular_file( FILE *filehandle )
#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )

// ARM compiliers don't reliably align stack variables
// ARM compilers don't reliably align stack variables
// - EABI requires only 8 byte stack alignment to be maintained
// - gcc can't align stack variables to more even if the stack were to be correctly aligned outside the function
// - armcc can't either, but is nice enough to actually tell you so
2 changes: 1 addition & 1 deletion common/ppc/pixel.c
@@ -1262,7 +1262,7 @@ static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, intptr_t i_pix1,
SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
sa8d4v, sa8d5v, sa8d6v, sa8d7v );

/* accumulation of the absolute value of all elements of the resulting bloc */
/* accumulation of the absolute value of all elements of the resulting block */
vec_s16_t abs0v = VEC_ABS(sa8d0v);
vec_s16_t abs1v = VEC_ABS(sa8d1v);
vec_s16_t sum01v = vec_add(abs0v, abs1v);
10 changes: 5 additions & 5 deletions common/rectangle.h
@@ -28,8 +28,8 @@
static ALWAYS_INLINE void x264_macroblock_cache_rect( void *dst, int w, int h, int s, uint32_t v )
{
uint8_t *d = dst;
uint16_t v2 = s == 2 ? v : v * 0x101;
uint32_t v4 = s == 4 ? v : s == 2 ? v * 0x10001 : v * 0x1010101;
uint16_t v2 = s >= 2 ? v : v * 0x101;
uint32_t v4 = s >= 4 ? v : s >= 2 ? v * 0x10001 : v * 0x1010101;
uint64_t v8 = v4 + ((uint64_t)v4 << 32);
s *= 8;

@@ -142,13 +142,13 @@ static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, in
else
x264_macroblock_cache_rect( mvd_cache, width*2, height, 2, mvd );
}
static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, uint8_t ref )
static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, int8_t ref )
{
void *ref_cache = &h->mb.cache.ref[i_list][X264_SCAN8_0+x+8*y];
if( x264_nonconstant_p( width ) || x264_nonconstant_p( height ) )
x264_cache_ref_func_table[width + (height<<1)-3]( ref_cache, ref );
x264_cache_ref_func_table[width + (height<<1)-3]( ref_cache, (uint8_t)ref );
else
x264_macroblock_cache_rect( ref_cache, width, height, 1, ref );
x264_macroblock_cache_rect( ref_cache, width, height, 1, (uint8_t)ref );
}
static ALWAYS_INLINE void x264_macroblock_cache_skip( x264_t *h, int x, int y, int width, int height, int b_skip )
{
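
The rectangle.h hunk relaxes the s == 2 / s == 4 tests to >= and passes the reference index through as int8_t with explicit casts back to uint8_t, but the core trick is unchanged: a small value is replicated across a wider word by multiplying with repeated 0x01 bytes so one store can fill several cache entries. A standalone illustration with an arbitrary value:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint8_t  v  = 0x2a;                       /* arbitrary 1-byte value */
        uint16_t v2 = v * 0x101;                  /* 0x2a2a */
        uint32_t v4 = v * 0x1010101u;             /* 0x2a2a2a2a */
        uint64_t v8 = v4 + ((uint64_t)v4 << 32);  /* 0x2a2a2a2a2a2a2a2a */
        printf( "%04x %08x %016llx\n", v2, v4, (unsigned long long)v8 );
        return 0;
    }
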
2 changes: 1 addition & 1 deletion common/vlc.c
@@ -37,7 +37,7 @@ void x264_cavlc_init( x264_t *h )
{
int mask = level >> 15;
int abs_level = (level^mask)-mask;
int i_level_code = abs_level*2-mask-2;
int i_level_code = abs_level ? abs_level*2-mask-2 : 0;
int i_next = i_suffix;
vlc_large_t *vlc = &x264_level_token[i_suffix][level+LEVEL_TABLE_SIZE/2];

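
The vlc.c change guards the level-code formula so level == 0 now maps to 0 instead of falling through to a negative value; the neighbouring lines use the usual branchless absolute value, where mask is all-ones for negative levels and zero otherwise. A small standalone sketch of the same arithmetic (illustrative, 16-bit levels assumed as in the table):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        for( int16_t level = -3; level <= 3; level++ )
        {
            int mask = level >> 15;                 /* -1 if level < 0, else 0 */
            int abs_level = (level ^ mask) - mask;  /* branchless abs() */
            int i_level_code = abs_level ? abs_level*2 - mask - 2 : 0;
            printf( "level %3d -> abs %d, code %d\n", level, abs_level, i_level_code );
        }
        return 0;
    }
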
2 changes: 1 addition & 1 deletion common/x86/bitstream-a.asm
@@ -34,7 +34,7 @@ SECTION .text
;-----------------------------------------------------------------------------
%macro NAL_LOOP 2
%%escape:
; Detect false positive to avoid unneccessary escape loop
; Detect false positive to avoid unnecessary escape loop
xor r3d, r3d
cmp byte [r0+r1-1], 0
setnz r3b
2 changes: 1 addition & 1 deletion common/x86/trellis-64.asm
@@ -158,7 +158,7 @@ cglobal %1, 4,15,9
%define cost_siglast stack+80
%define level_tree stack+96

; trellis_node_t is layed out differently than C.
; trellis_node_t is laid out differently than C.
; struct-of-arrays rather than array-of-structs, for simd.
%define nodes_curq r7
%define nodes_prevq r8
2 changes: 1 addition & 1 deletion common/x86/util.h
@@ -121,7 +121,7 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmx2(uint8_t *mvdleft, uint8_t
"m"(pb_2),"m"(pb_32),"m"(pb_33)
:"mm0", "mm1", "mm2"
);
return amvd;
return (uint16_t)amvd;
}

#define x264_predictor_clip x264_predictor_clip_mmx2
53 changes: 41 additions & 12 deletions common/x86/x86inc.asm
@@ -86,6 +86,11 @@
%define mangle(x) x
%endif

; Use VEX-encoding even in non-AVX functions
%ifndef FORCE_VEX_ENCODING
%define FORCE_VEX_ENCODING 0
%endif

%macro SECTION_RODATA 0-1 16
%ifidn __OUTPUT_FORMAT__,win32
SECTION .rdata align=%1
@@ -1014,7 +1019,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%endmacro

%macro INIT_XMM 0-1+
%assign avx_enabled 0
%assign avx_enabled FORCE_VEX_ENCODING
%define RESET_MM_PERMUTATION INIT_XMM %1
%define mmsize 16
%define mova movdqa
@@ -1339,26 +1344,50 @@ INIT_XMM
%elif %0 >= 9
__instr %6, %7, %8, %9
%elif %0 == 8
%if avx_enabled && %5
%if avx_enabled && __sizeofreg >= 16 && %4 == 0
%xdefine __src1 %7
%xdefine __src2 %8
%ifnum regnumof%7
%ifnum regnumof%8
%if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
; Most VEX-encoded instructions require an additional byte to encode when
; src2 is a high register (e.g. m8..15). If the instruction is commutative
; we can swap src1 and src2 when doing so reduces the instruction length.
%xdefine __src1 %8
%xdefine __src2 %7
%if %5
%ifnum regnumof%7
%ifnum regnumof%8
%if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
; Most VEX-encoded instructions require an additional byte to encode when
; src2 is a high register (e.g. m8..15). If the instruction is commutative
; we can swap src1 and src2 when doing so reduces the instruction length.
%xdefine __src1 %8
%xdefine __src2 %7
%endif
%endif
%elifnum regnumof%8 ; put memory operands in src2 when possible
%xdefine __src1 %8
%xdefine __src2 %7
%else
%assign __emulate_avx 1
%endif
%elifnnum regnumof%7
; EVEX allows imm8 shift instructions to be used with memory operands,
; but VEX does not. This handles those special cases.
%ifnnum %8
%assign __emulate_avx 1
%elif notcpuflag(avx512)
%assign __emulate_avx 1
%endif
%endif
__instr %6, __src1, __src2
%if __emulate_avx ; a separate load is required
%if %3
vmovaps %6, %7
%else
vmovdqa %6, %7
%endif
__instr %6, %8
%else
__instr %6, __src1, __src2
%endif
%else
__instr %6, %7, %8
%endif
%elif %0 == 7
%if avx_enabled && %5
%if avx_enabled && __sizeofreg >= 16 && %5
%xdefine __src1 %6
%xdefine __src2 %7
%ifnum regnumof%6
2 changes: 1 addition & 1 deletion config.guess
@@ -1083,7 +1083,7 @@ EOF
# uname -m prints for DJGPP always 'pc', but it prints nothing about
# the processor, so we play safe by assuming i586.
# Note: whatever this is, it MUST be the same as what config.sub
# prints for the "djgpp" host, or else GDB configury will decide that
# prints for the "djgpp" host, or else GDB configure will decide that
# this is a cross-build.
echo i586-pc-msdosdjgpp
exit ;;
2 changes: 1 addition & 1 deletion doc/threads.txt
@@ -33,7 +33,7 @@ I enabled the various components of slicing one at a time, and measured the port
* 16% reset cabac contexts
* 6% deblocking between slices (you don't strictly have to turn this off just for standard compliance, but you do if you want to use slices for decoder multithreading)
* 2% cabac neighbors (cbp, skip, etc)
The proportional cost of redundant headers should certainly depend on bitrate (since the header size is constant and everything else depends on bitrate). Deblocking should too (due to varing deblock strength).
The proportional cost of redundant headers should certainly depend on bitrate (since the header size is constant and everything else depends on bitrate). Deblocking should too (due to varying deblock strength).
But none of the proportions should depend strongly on the number of slices: some are triggered per slice while some are triggered per macroblock-that's-on-the-edge-of-a-slice, but as long as there's no more than 1 slice per row, the relative frequency of those two conditions is determined solely by the image width.


4 changes: 2 additions & 2 deletions encoder/analyse.c
@@ -1346,7 +1346,7 @@ static void mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
i_maxref = ref;\
}

/* early termination: if 16x16 chose ref 0, then evalute no refs older
/* early termination: if 16x16 chose ref 0, then evaluate no refs older
* than those used by the neighbors */
if( a->b_early_terminate && (i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0) )
@@ -2173,7 +2173,7 @@ static void mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};

/* early termination: if 16x16 chose ref 0, then evalute no refs older
/* early termination: if 16x16 chose ref 0, then evaluate no refs older
* than those used by the neighbors */
#define CHECK_NEIGHBOUR(i)\
{\
7 changes: 6 additions & 1 deletion encoder/lookahead.c
@@ -89,9 +89,14 @@ static void lookahead_slicetype_decide( x264_t *h )

REALIGN_STACK static void *lookahead_thread( x264_t *h )
{
while( !h->lookahead->b_exit_thread )
while( 1 )
{
x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
if( h->lookahead->b_exit_thread )
{
x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
break;
}
x264_pthread_mutex_lock( &h->lookahead->next.mutex );
int shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size );
lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, shift );
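
The lookahead change above moves the exit-flag test under ifbuf.mutex, so the thread no longer reads the flag unsynchronized before taking the lock and then risks blocking after shutdown has already been signalled. The same check-under-lock pattern reduced to a self-contained pthreads sketch (struct and names are hypothetical, not x264's):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stddef.h>

    typedef struct {
        pthread_mutex_t mutex;
        bool b_exit_thread;
        /* ... queued frames would live here ... */
    } lookahead_ctx_t;

    static void *worker( void *arg )
    {
        lookahead_ctx_t *ctx = arg;
        while( 1 )
        {
            pthread_mutex_lock( &ctx->mutex );
            if( ctx->b_exit_thread )   /* flag is read under the same lock that protects it */
            {
                pthread_mutex_unlock( &ctx->mutex );
                break;
            }
            /* ... consume queued input here, or wait on a condition variable ... */
            pthread_mutex_unlock( &ctx->mutex );
        }
        return NULL;
    }
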
4 changes: 2 additions & 2 deletions encoder/ratecontrol.c
@@ -608,7 +608,7 @@ int x264_reference_build_list_optimal( x264_t *h )
COPY2_IF_GT( max, refcount[i], bestref, i );

/* FIXME: If there are duplicates from frames other than ref0 then it is possible
* that the optimal ordering doesnt place every duplicate. */
* that the optimal ordering doesn't place every duplicate. */

refcount[bestref] = -1;
h->fref[0][ref] = frames[bestref];
@@ -1661,7 +1661,7 @@ int x264_ratecontrol_mb( x264_t *h, int bits )
float b1 = bits_so_far + predict_row_size_to_end( h, y, rc->qpm ) + size_of_other_slices;
float trust_coeff = x264_clip3f( bits_so_far / slice_size_planned, 0.0, 1.0 );

/* Don't increase the row QPs until a sufficent amount of the bits of the frame have been processed, in case a flat */
/* Don't increase the row QPs until a sufficient amount of the bits of the frame have been processed, in case a flat */
/* area at the top of the frame was measured inaccurately. */
if( trust_coeff < 0.05f )
qp_max = qp_absolute_max = prev_row_qp;
