diff --git a/AUTHORS b/AUTHORS
index d14deb881..d7fda64a9 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -58,7 +58,7 @@ S: Sweden
 N: Laurent Aimar
 E: fenrir AT videolan DOT org
 C: fenrir
-D: Intial import, former maintainer
+D: Initial import, former maintainer
 D: x86 asm (mmx/mmx2)
 S: France
 
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
index 1b739f278..96db7f201 100644
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -210,7 +210,7 @@ static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride,
     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
     intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
     uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
-    if( (mvy&3) == 3 ) // explict if() to force conditional add
+    if( (mvy&3) == 3 ) // explicit if() to force conditional add
         src1 += i_src_stride;
 
     if( qpel_idx & 5 ) /* qpel interpolation needed */
@@ -236,7 +236,7 @@ static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
     intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
     uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
-    if( (mvy&3) == 3 ) // explict if() to force conditional add
+    if( (mvy&3) == 3 ) // explicit if() to force conditional add
         src1 += i_src_stride;
 
     if( qpel_idx & 5 ) /* qpel interpolation needed */
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index c33549286..6dff47234 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -218,7 +218,7 @@ static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride,
     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
     intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
     uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
-    if( (mvy&3) == 3 ) // explict if() to force conditional add
+    if( (mvy&3) == 3 ) // explicit if() to force conditional add
         src1 += i_src_stride;
 
     if( qpel_idx & 5 ) /* qpel interpolation needed */
@@ -244,7 +244,7 @@ static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
     intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
     uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
-    if( (mvy&3) == 3 ) // explict if() to force conditional add
+    if( (mvy&3) == 3 ) // explicit if() to force conditional add
         src1 += i_src_stride;
 
     if( qpel_idx & 5 ) /* qpel interpolation needed */
diff --git a/common/bitstream.c b/common/bitstream.c
index c45f86251..60459b703 100644
--- a/common/bitstream.c
+++ b/common/bitstream.c
@@ -92,10 +92,10 @@ void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal )
     {
         /* Size doesn't include the size of the header we're writing now. */
         int chunk_size = size - 4;
-        orig_dst[0] = chunk_size >> 24;
-        orig_dst[1] = chunk_size >> 16;
-        orig_dst[2] = chunk_size >> 8;
-        orig_dst[3] = chunk_size >> 0;
+        orig_dst[0] = (uint8_t)(chunk_size >> 24);
+        orig_dst[1] = (uint8_t)(chunk_size >> 16);
+        orig_dst[2] = (uint8_t)(chunk_size >> 8);
+        orig_dst[3] = (uint8_t)(chunk_size >> 0);
     }
 
     nal->i_payload = size;
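
NOTE: the bitstream.c hunk stores the NAL length prefix used for
length-prefixed (non-Annex-B) output as a 4-byte big-endian value; the new
(uint8_t) casts only silence narrowing warnings, since assignment to a
uint8_t element truncates to the low byte anyway. A minimal standalone sketch
of the same pattern (write_be32 is a hypothetical helper, not part of the
patch):

    #include <stdint.h>

    /* Store a 32-bit value big-endian, one byte at a time. */
    static void write_be32( uint8_t *dst, uint32_t v )
    {
        dst[0] = (uint8_t)(v >> 24);
        dst[1] = (uint8_t)(v >> 16);
        dst[2] = (uint8_t)(v >> 8);
        dst[3] = (uint8_t)(v >> 0);
    }
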
diff --git a/common/cabac.c b/common/cabac.c
index 9c699953c..010580d5c 100644
--- a/common/cabac.c
+++ b/common/cabac.c
@@ -89,10 +89,10 @@ static inline void cabac_putbyte( x264_cabac_t *cb )
             cb->p[-1] += carry;
             while( bytes_outstanding > 0 )
             {
-                *(cb->p++) = carry-1;
+                *(cb->p++) = (uint8_t)(carry-1);
                 bytes_outstanding--;
             }
-            *(cb->p++) = out;
+            *(cb->p++) = (uint8_t)out;
             cb->i_bytes_outstanding = 0;
         }
     }
diff --git a/common/deblock.c b/common/deblock.c
index 5b52d9d9e..5779ac11b 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -655,11 +655,11 @@ void x264_macroblock_deblock( x264_t *h )
         } while( 0 )
 
         if( !transform_8x8 ) FILTER( 0, 1 );
-                             FILTER( 0, 2 );
+        FILTER( 0, 2 );
         if( !transform_8x8 ) FILTER( 0, 3 );
         if( !transform_8x8 ) FILTER( 1, 1 );
-                             FILTER( 1, 2 );
+        FILTER( 1, 2 );
         if( !transform_8x8 ) FILTER( 1, 3 );
 
 #undef FILTER
 
diff --git a/common/macroblock.h b/common/macroblock.h
index 42e551817..a8ed58081 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -361,7 +361,7 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed );
  * uses all neighbors, even those that didn't end up using this ref.
  * h->mb. need only valid values from other blocks */
 #define x264_mb_predict_mv_ref16x16 x264_template(mb_predict_mv_ref16x16)
-void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[8][2], int *i_mvc );
+void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t (*mvc)[2], int *i_mvc );
 #define x264_mb_mc x264_template(mb_mc)
 void x264_mb_mc( x264_t *h );
 
diff --git a/common/mc.h b/common/mc.h
index 4b55dbfe6..8c12b9e0c 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -61,8 +61,8 @@ static void mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t
         if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\
             continue;\
\
-        unsigned mbx = current[0];\
-        unsigned mby = current[1];\
+        unsigned mbx = (unsigned)current[0];\
+        unsigned mby = (unsigned)current[1];\
         unsigned idx0 = mbx + mby * stride;\
         unsigned idx2 = idx0 + stride;\
\
diff --git a/common/mvpred.c b/common/mvpred.c
index b8f913e91..c00dc9dfe 100644
--- a/common/mvpred.c
+++ b/common/mvpred.c
@@ -171,8 +171,8 @@ void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
     int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
 
     if( i_refa == -2 || i_refb == -2 ||
-        !( i_refa | M32( mv_a ) ) ||
-        !( i_refb | M32( mv_b ) ) )
+        !( (uint32_t)i_refa | M32( mv_a ) ) ||
+        !( (uint32_t)i_refb | M32( mv_b ) ) )
     {
         M32( mv ) = 0;
     }
@@ -304,7 +304,7 @@ static ALWAYS_INLINE int mb_predict_mv_direct16x16_spatial( x264_t *h, int b_int
             mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1];
     }
 
-    int i_ref = X264_MIN3( (unsigned)i_refa, (unsigned)i_refb, (unsigned)i_refc );
+    int i_ref = (int)X264_MIN3( (unsigned)i_refa, (unsigned)i_refb, (unsigned)i_refc );
     if( i_ref < 0 )
     {
         i_ref = -1;
@@ -516,7 +516,7 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
 }
 
 /* This just improves encoder performance, it's not part of the spec */
-void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[9][2], int *i_mvc )
+void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t (*mvc)[2], int *i_mvc )
 {
     int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref];
     int i = 0;
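
NOTE: in the mvpred.c hunks, unavailable references are negative (-1/-2), and
the code deliberately compares them as unsigned so they sort above every valid
reference index; the added (int)/(uint32_t) casts only make those conversions
explicit for the compiler. A standalone sketch of the trick (MIN3 and
min_valid_ref are illustrative names, not from the patch):

    /* Pick the smallest non-negative ref of three, or -1 if none is valid. */
    #define MIN3(a,b,c) ((a)<(b) ? ((a)<(c)?(a):(c)) : ((b)<(c)?(b):(c)))

    static int min_valid_ref( int refa, int refb, int refc )
    {
        /* Negative refs wrap to huge unsigned values, so they never win. */
        int ref = (int)MIN3( (unsigned)refa, (unsigned)refb, (unsigned)refc );
        return ref < 0 ? -1 : ref; /* all three invalid: still negative here */
    }
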
diff --git a/common/osdep.h b/common/osdep.h
index 082144db0..556bf5840 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -289,7 +289,7 @@ static inline int x264_is_regular_file( FILE *filehandle )
 #define ALIGNED_8( var )  DECLARE_ALIGNED( var, 8 )
 #define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
 
-// ARM compiliers don't reliably align stack variables
+// ARM compilers don't reliably align stack variables
 // - EABI requires only 8 byte stack alignment to be maintained
 // - gcc can't align stack variables to more even if the stack were to be correctly aligned outside the function
 // - armcc can't either, but is nice enough to actually tell you so
diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index 1e00bcf44..419621b05 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -1262,7 +1262,7 @@ static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, intptr_t i_pix1,
     SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v );
 
-    /* accumulation of the absolute value of all elements of the resulting bloc */
+    /* accumulation of the absolute value of all elements of the resulting block */
     vec_s16_t abs0v = VEC_ABS(sa8d0v);
     vec_s16_t abs1v = VEC_ABS(sa8d1v);
     vec_s16_t sum01v = vec_add(abs0v, abs1v);
 
diff --git a/common/rectangle.h b/common/rectangle.h
index 07583df58..3849c0320 100644
--- a/common/rectangle.h
+++ b/common/rectangle.h
@@ -28,8 +28,8 @@
 static ALWAYS_INLINE void x264_macroblock_cache_rect( void *dst, int w, int h, int s, uint32_t v )
 {
     uint8_t *d = dst;
-    uint16_t v2 = s == 2 ? v : v * 0x101;
-    uint32_t v4 = s == 4 ? v : s == 2 ? v * 0x10001 : v * 0x1010101;
+    uint16_t v2 = s >= 2 ? v : v * 0x101;
+    uint32_t v4 = s >= 4 ? v : s >= 2 ? v * 0x10001 : v * 0x1010101;
     uint64_t v8 = v4 + ((uint64_t)v4 << 32);
     s *= 8;
 
@@ -142,13 +142,13 @@ static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, in
     else
         x264_macroblock_cache_rect( mvd_cache, width*2, height, 2, mvd );
 }
-static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, uint8_t ref )
+static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, int8_t ref )
 {
     void *ref_cache = &h->mb.cache.ref[i_list][X264_SCAN8_0+x+8*y];
     if( x264_nonconstant_p( width ) || x264_nonconstant_p( height ) )
-        x264_cache_ref_func_table[width + (height<<1)-3]( ref_cache, ref );
+        x264_cache_ref_func_table[width + (height<<1)-3]( ref_cache, (uint8_t)ref );
     else
-        x264_macroblock_cache_rect( ref_cache, width, height, 1, ref );
+        x264_macroblock_cache_rect( ref_cache, width, height, 1, (uint8_t)ref );
 }
 static ALWAYS_INLINE void x264_macroblock_cache_skip( x264_t *h, int x, int y, int width, int height, int b_skip )
 {
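
NOTE: x264_macroblock_cache_rect splats a small value across a wider word by
multiplication: v * 0x101 copies a byte into both bytes of a uint16_t, and
v * 0x1010101 into all four bytes of a uint32_t. The == to >= change passes
already-wide values through instead of re-multiplying, presumably so the
unused narrower splats no longer compute overflowing products. Illustrative
helpers (not part of the patch):

    #include <stdint.h>

    static uint16_t splat2( uint8_t v )    { return v * 0x101; }      /* 0xAB -> 0xABAB */
    static uint32_t splat4( uint8_t v )    { return v * 0x1010101u; } /* 0xAB -> 0xABABABAB */
    static uint32_t splat4_2( uint16_t v ) { return v * 0x10001u; }   /* 0xABCD -> 0xABCDABCD */
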
diff --git a/common/vlc.c b/common/vlc.c
index 13a668b81..def68f997 100644
--- a/common/vlc.c
+++ b/common/vlc.c
@@ -37,7 +37,7 @@ void x264_cavlc_init( x264_t *h )
         {
             int mask = level >> 15;
             int abs_level = (level^mask)-mask;
-            int i_level_code = abs_level*2-mask-2;
+            int i_level_code = abs_level ? abs_level*2-mask-2 : 0;
             int i_next = i_suffix;
             vlc_large_t *vlc = &x264_level_token[i_suffix][level+LEVEL_TABLE_SIZE/2];
 
diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm
index 28cd66b3b..43b6395c2 100644
--- a/common/x86/bitstream-a.asm
+++ b/common/x86/bitstream-a.asm
@@ -34,7 +34,7 @@ SECTION .text
 ;-----------------------------------------------------------------------------
 %macro NAL_LOOP 2
 %%escape:
-    ; Detect false positive to avoid unneccessary escape loop
+    ; Detect false positive to avoid unnecessary escape loop
     xor  r3d, r3d
     cmp  byte [r0+r1-1], 0
     setnz r3b
diff --git a/common/x86/trellis-64.asm b/common/x86/trellis-64.asm
index 3f57a25c5..aed20478b 100644
--- a/common/x86/trellis-64.asm
+++ b/common/x86/trellis-64.asm
@@ -158,7 +158,7 @@ cglobal %1, 4,15,9
 %define cost_siglast stack+80
 %define level_tree   stack+96
 
-    ; trellis_node_t is layed out differently than C.
+    ; trellis_node_t is laid out differently than C.
     ; struct-of-arrays rather than array-of-structs, for simd.
 %define nodes_curq r7
 %define nodes_prevq r8
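
NOTE: the trellis-64.asm comment above ("struct-of-arrays rather than
array-of-structs, for simd") describes a standard SIMD-friendly layout. An
illustrative C analogue (field names and types here are hypothetical, not
x264's actual trellis_node_t):

    #include <stdint.h>

    typedef struct
    {
        uint64_t score;
        int      level_idx;
    } node_t;
    node_t nodes_aos[8];        /* array-of-structs: one node's fields adjacent */

    typedef struct
    {
        uint64_t score[8];      /* struct-of-arrays: the same field of all 8    */
        int      level_idx[8];  /* nodes is contiguous, so one SIMD load grabs  */
    } nodes_soa_t;              /* all 8 scores at once                         */
    nodes_soa_t nodes_soa;
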
diff --git a/common/x86/util.h b/common/x86/util.h
index c057298a4..77a99313b 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -121,7 +121,7 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmx2(uint8_t *mvdleft, uint8_t
         "m"(pb_2),"m"(pb_32),"m"(pb_33)
         :"mm0", "mm1", "mm2"
     );
-    return amvd;
+    return (uint16_t)amvd;
 }
 
 #define x264_predictor_clip x264_predictor_clip_mmx2
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 0e4f4f9df..0ead8f66d 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -86,6 +86,11 @@
     %define mangle(x) x
 %endif
 
+; Use VEX-encoding even in non-AVX functions
+%ifndef FORCE_VEX_ENCODING
+    %define FORCE_VEX_ENCODING 0
+%endif
+
 %macro SECTION_RODATA 0-1 16
     %ifidn __OUTPUT_FORMAT__,win32
         SECTION .rdata align=%1
@@ -1014,7 +1019,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
 %endmacro
 
 %macro INIT_XMM 0-1+
-    %assign avx_enabled 0
+    %assign avx_enabled FORCE_VEX_ENCODING
     %define RESET_MM_PERMUTATION INIT_XMM %1
     %define mmsize 16
     %define mova movdqa
@@ -1339,26 +1344,50 @@ INIT_XMM
     %elif %0 >= 9
         __instr %6, %7, %8, %9
     %elif %0 == 8
-        %if avx_enabled && %5
+        %if avx_enabled && __sizeofreg >= 16 && %4 == 0
             %xdefine __src1 %7
             %xdefine __src2 %8
-            %ifnum regnumof%7
-                %ifnum regnumof%8
-                    %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
-                        ; Most VEX-encoded instructions require an additional byte to encode when
-                        ; src2 is a high register (e.g. m8..15). If the instruction is commutative
-                        ; we can swap src1 and src2 when doing so reduces the instruction length.
-                        %xdefine __src1 %8
-                        %xdefine __src2 %7
+            %if %5
+                %ifnum regnumof%7
+                    %ifnum regnumof%8
+                        %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
+                            ; Most VEX-encoded instructions require an additional byte to encode when
+                            ; src2 is a high register (e.g. m8..15). If the instruction is commutative
+                            ; we can swap src1 and src2 when doing so reduces the instruction length.
+                            %xdefine __src1 %8
+                            %xdefine __src2 %7
+                        %endif
                     %endif
+                %elifnum regnumof%8 ; put memory operands in src2 when possible
+                    %xdefine __src1 %8
+                    %xdefine __src2 %7
+                %else
+                    %assign __emulate_avx 1
+                %endif
+            %elifnnum regnumof%7
+                ; EVEX allows imm8 shift instructions to be used with memory operands,
+                ; but VEX does not. This handles those special cases.
+                %ifnnum %8
+                    %assign __emulate_avx 1
+                %elif notcpuflag(avx512)
+                    %assign __emulate_avx 1
                 %endif
             %endif
-            __instr %6, __src1, __src2
+            %if __emulate_avx ; a separate load is required
+                %if %3
+                    vmovaps %6, %7
+                %else
+                    vmovdqa %6, %7
+                %endif
+                __instr %6, %8
+            %else
+                __instr %6, __src1, __src2
+            %endif
         %else
             __instr %6, %7, %8
         %endif
     %elif %0 == 7
-        %if avx_enabled && %5
+        %if avx_enabled && __sizeofreg >= 16 && %5
         %xdefine __src1 %6
         %xdefine __src2 %7
         %ifnum regnumof%6
diff --git a/config.guess b/config.guess
index ab192f67c..d437be008 100755
--- a/config.guess
+++ b/config.guess
@@ -1083,7 +1083,7 @@ EOF
     # uname -m prints for DJGPP always 'pc', but it prints nothing about
     # the processor, so we play safe by assuming i586.
     # Note: whatever this is, it MUST be the same as what config.sub
-    # prints for the "djgpp" host, or else GDB configury will decide that
+    # prints for the "djgpp" host, or else GDB configure will decide that
     # this is a cross-build.
     echo i586-pc-msdosdjgpp
     exit ;;
diff --git a/doc/threads.txt b/doc/threads.txt
index cea1f6576..4b0ee306e 100644
--- a/doc/threads.txt
+++ b/doc/threads.txt
@@ -33,7 +33,7 @@ I enabled the various components of slicing one at a time, and measured the port
 * 16% reset cabac contexts
 * 6% deblocking between slices (you don't strictly have to turn this off just for standard compliance, but you do if you want to use slices for decoder multithreading)
 * 2% cabac neighbors (cbp, skip, etc)
-The proportional cost of redundant headers should certainly depend on bitrate (since the header size is constant and everything else depends on bitrate). Deblocking should too (due to varing deblock strength).
+The proportional cost of redundant headers should certainly depend on bitrate (since the header size is constant and everything else depends on bitrate). Deblocking should too (due to varying deblock strength).
 But none of the proportions should depend strongly on the number of slices: some are triggered per slice while some are triggered per macroblock-that's-on-the-edge-of-a-slice, but as long as there's no more than 1 slice per row, the relative frequency of those two conditions is determined solely by the image width.
 
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 5e32e223e..48bb72a21 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1346,7 +1346,7 @@ static void mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
             i_maxref = ref;\
     }
 
-    /* early termination: if 16x16 chose ref 0, then evalute no refs older
+    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
      * than those used by the neighbors */
     if( a->b_early_terminate && (i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
         h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0) )
@@ -2173,7 +2173,7 @@ static void mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
     ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
     int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
 
-    /* early termination: if 16x16 chose ref 0, then evalute no refs older
+    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
      * than those used by the neighbors */
 #define CHECK_NEIGHBOUR(i)\
     {\
diff --git a/encoder/lookahead.c b/encoder/lookahead.c
index 18958bd0a..93f59b190 100644
--- a/encoder/lookahead.c
+++ b/encoder/lookahead.c
@@ -89,9 +89,14 @@ static void lookahead_slicetype_decide( x264_t *h )
 
 REALIGN_STACK static void *lookahead_thread( x264_t *h )
 {
-    while( !h->lookahead->b_exit_thread )
+    while( 1 )
     {
         x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+        if( h->lookahead->b_exit_thread )
+        {
+            x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+            break;
+        }
         x264_pthread_mutex_lock( &h->lookahead->next.mutex );
         int shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size );
         lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, shift );
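
NOTE: the lookahead.c hunk fixes a read of b_exit_thread made outside the
mutex that protects it; the flag is now (re)checked after taking
ifbuf.mutex, so a concurrent writer can't be missed. A minimal sketch of the
corrected pattern (queue_t/worker are illustrative names, not x264's):

    #include <pthread.h>
    #include <stddef.h>

    typedef struct { pthread_mutex_t mutex; int b_exit; /* ... */ } queue_t;

    static void *worker( void *arg )
    {
        queue_t *q = arg;
        while( 1 )
        {
            pthread_mutex_lock( &q->mutex );
            if( q->b_exit )              /* checked under the lock */
            {
                pthread_mutex_unlock( &q->mutex );
                break;
            }
            /* ... consume work under the lock, as in lookahead_thread ... */
            pthread_mutex_unlock( &q->mutex );
        }
        return NULL;
    }
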
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 9a6ea1c9f..29b49dab0 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -608,7 +608,7 @@ int x264_reference_build_list_optimal( x264_t *h )
             COPY2_IF_GT( max, refcount[i], bestref, i );
 
         /* FIXME: If there are duplicates from frames other than ref0 then it is possible
-         * that the optimal ordering doesnt place every duplicate. */
+         * that the optimal ordering doesn't place every duplicate. */
         refcount[bestref] = -1;
         h->fref[0][ref] = frames[bestref];
 
@@ -1661,7 +1661,7 @@ int x264_ratecontrol_mb( x264_t *h, int bits )
             float b1 = bits_so_far + predict_row_size_to_end( h, y, rc->qpm ) + size_of_other_slices;
             float trust_coeff = x264_clip3f( bits_so_far / slice_size_planned, 0.0, 1.0 );
 
-            /* Don't increase the row QPs until a sufficent amount of the bits of the frame have been processed, in case a flat */
+            /* Don't increase the row QPs until a sufficient amount of the bits of the frame have been processed, in case a flat */
             /* area at the top of the frame was measured inaccurately. */
             if( trust_coeff < 0.05f )
                 qp_max = qp_absolute_max = prev_row_qp;
diff --git a/encoder/rdo.c b/encoder/rdo.c
index 93170437a..4e8300026 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -141,7 +141,8 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
             int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, (pixel*)x264_zero, 0 ) >> 1;
             satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, (pixel*)x264_zero, 0 ) - dc - cached_satd( h, size, x, y ));
         }
-        satd = (satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8;
+        int64_t tmp = ((int64_t)satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8;
+        satd = X264_MIN( tmp, COST_MAX );
     }
     return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;
 }
@@ -470,7 +471,7 @@ int trellis_dc_shortcut( int sign_coef, int quant_coef, int unquant_mf, int coef
 
     /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */
     int d = sign_coef - ((SIGN(unquant_abs_level, sign_coef) + 8)&~15);
-    uint64_t score = (uint64_t)d*d * coef_weight;
+    uint64_t score = (int64_t)d*d * coef_weight;
 
     /* code the proposed level, and count how much entropy it would take */
    if( abs_level )
@@ -733,11 +734,11 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
     trellis_level_t level_tree[64*8*2];
     int levels_used = 1;
     /* init trellis */
-    trellis_node_t nodes[2][8];
+    trellis_node_t nodes[2][8] = {0};
     trellis_node_t *nodes_cur = nodes[0];
     trellis_node_t *nodes_prev = nodes[1];
     trellis_node_t *bnode;
-    for( int j = 1; j < 4; j++ )
+    for( int j = 1; j < 8; j++ )
         nodes_cur[j].score = TRELLIS_SCORE_MAX;
     nodes_cur[0].score = TRELLIS_SCORE_BIAS;
     nodes_cur[0].level_idx = 0;
@@ -824,17 +825,18 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
             int predicted_coef = orig_coef - sign_coef;\
             int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));\
             int psy_weight = coef_weight1[zigzag[i]] * h->mb.i_psy_trellis;\
-            ssd1[k] = (uint64_t)d*d * coef_weight2[zigzag[i]] - psy_weight * psy_value;\
+            int64_t tmp = (int64_t)d*d * coef_weight2[zigzag[i]] - (int64_t)psy_weight * psy_value;\
+            ssd1[k] = (uint64_t)tmp;\
         }\
         else\
             /* FIXME: for i16x16 dc is this weight optimal? */\
-            ssd1[k] = (uint64_t)d*d * (dc?256:coef_weight2[zigzag[i]]);\
+            ssd1[k] = (int64_t)d*d * (dc?256:coef_weight2[zigzag[i]]);\
         ssd0[k] = ssd1[k];\
         if( !i && !dc && !ctx_hi )\
         {\
             /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */\
             d = sign_coef - ((SIGN(unquant_abs_level, sign_coef) + 8)&~15);\
-            ssd0[k] = (uint64_t)d*d * coef_weight2[zigzag[i]];\
+            ssd0[k] = (int64_t)d*d * coef_weight2[zigzag[i]];\
         }\
     }\
\
@@ -925,7 +927,7 @@ int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
     ALIGNED_ARRAY_16( dctcoef, coefs,[16] );
     const uint32_t *coef_weight1 = b_8x8 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
     const uint32_t *coef_weight2 = b_8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
-    int delta_distortion[16];
+    int64_t delta_distortion[16];
     int64_t score = 1ULL<<62;
     int i, j;
     const int f = 1<<15;
@@ -952,7 +954,7 @@ int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
 
     /* Find last non-zero coefficient. */
     for( i = end; i >= start; i -= step )
-        if( (unsigned)(dct[zigzag[i]] * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) + f-1) >= 2*f )
+        if( abs(dct[zigzag[i]]) * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) >= f )
             break;
 
     if( i < start )
@@ -985,7 +987,7 @@ int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
         int unquant0 = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[j]]) * (nearest_quant-1) + 128) >> 8);
         int d1 = abs_coef - unquant1;
         int d0 = abs_coef - unquant0;
-        delta_distortion[i] = (d0*d0 - d1*d1) * (dc?256:coef_weight2[zigzag[j]]);
+        delta_distortion[i] = (int64_t)(d0*d0 - d1*d1) * (dc?256:coef_weight2[zigzag[j]]);
 
         /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
         if( h->mb.i_psy_trellis && j && !dc && !b_chroma )
@@ -1023,7 +1025,7 @@ int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
     while( 1 )
     {
         int64_t iter_score = score;
-        int iter_distortion_delta = 0;
+        int64_t iter_distortion_delta = 0;
         int iter_coef = -1;
         int iter_mask = coef_mask;
         int iter_round = round_mask;
@@ -1038,7 +1040,7 @@ int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
             int old_coef = coefs[i];
             int new_coef = quant_coefs[round_change][i];
             int cur_mask = (coef_mask&~(1 << i))|(!!new_coef << i);
-            int cur_distortion_delta = delta_distortion[i] * (round_change ? -1 : 1);
+            int64_t cur_distortion_delta = delta_distortion[i] * (round_change ? -1 : 1);
             int64_t cur_score = cur_distortion_delta;
 
             coefs[i] = new_coef;
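
NOTE: the rdo.c changes all widen intermediate arithmetic to 64 bits. In
ssd_plane, three int factors are multiplied before the >> 8, which can
overflow 32 bits; casting the first operand to int64_t makes the whole
product 64-bit, and the result is clamped afterwards. A sketch of the fix in
isolation (weighted_cost and its parameters are illustrative, not the
patch's code):

    #include <stdint.h>

    static int weighted_cost( int satd, int psy_rd, int lambda, int cost_max )
    {
        /* (int64_t)satd promotes the entire product chain to 64 bits. */
        int64_t tmp = ((int64_t)satd * psy_rd * lambda + 128) >> 8;
        return tmp < cost_max ? (int)tmp : cost_max; /* clamp, as X264_MIN does */
    }
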
diff --git a/encoder/set.c b/encoder/set.c
index ec79985c8..67b0ad5d5 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -865,6 +865,7 @@ int x264_sei_avcintra_vanc_write( x264_t *h, bs_t *s, int len )
     return 0;
 }
 
+#undef ERROR
 #define ERROR(...)\
 {\
     if( verbose )\
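
NOTE: ERROR is presumably already #defined earlier in set.c (or by a platform
header) with a different body, and redefining an in-scope macro triggers a
redefinition diagnostic, hence the #undef before the new definition. Generic
form of the pattern (the expansion here is a placeholder):

    #ifdef ERROR
    #undef ERROR
    #endif
    #define ERROR(...) do { /* new expansion */ } while( 0 )
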
diff --git a/encoder/slicetype-cl.c b/encoder/slicetype-cl.c
index e31949913..9dbd1f21a 100644
--- a/encoder/slicetype-cl.c
+++ b/encoder/slicetype-cl.c
@@ -283,7 +283,7 @@ int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
     return 0;
 }
 
-/* This function was tested emprically on a number of AMD and NV GPUs. Making a
+/* This function was tested empirically on a number of AMD and NV GPUs. Making a
  * function which returns perfect launch dimensions is impossible; some
  * applications will have self-tuning code to try many possible variables and
 * measure the runtime. Here we simply make an educated guess based on what we
diff --git a/filters/video/resize.c b/filters/video/resize.c
index 532dbd3c9..c3218df0a 100644
--- a/filters/video/resize.c
+++ b/filters/video/resize.c
@@ -128,7 +128,7 @@ static uint32_t convert_method_to_flag( const char *name )
         flag = SWS_AREA;
     else if( !strcasecmp( name, "bicublin" ) )
         flag = SWS_BICUBLIN;
-    else if( !strcasecmp( name, "guass" ) )
+    else if( !strcasecmp( name, "gauss" ) )
         flag = SWS_GAUSS;
     else if( !strcasecmp( name, "sinc" ) )
         flag = SWS_SINC;
diff --git a/output/matroska_ebml.c b/output/matroska_ebml.c
index b269437c6..d5b9b1d66 100644
--- a/output/matroska_ebml.c
+++ b/output/matroska_ebml.c
@@ -60,7 +60,7 @@ struct mk_writer
     int64_t cluster_tc_scaled;
     int64_t frame_tc, max_frame_tc;
 
-    char wrote_header, in_frame, keyframe, skippable;
+    int8_t wrote_header, in_frame, keyframe, skippable;
 };
 
 static mk_context *mk_create_context( mk_writer *w, mk_context *parent, unsigned id )
@@ -111,7 +111,7 @@ static int mk_append_context_data( mk_context *c, const void *data, unsigned siz
         c->d_max = dn;
     }
 
-    memcpy( (char*)c->data + c->d_cur, data, size );
+    memcpy( (uint8_t*)c->data + c->d_cur, data, size );
 
     c->d_cur = ns;
 
@@ -120,7 +120,7 @@ static int mk_append_context_data( mk_context *c, const void *data, unsigned siz
 
 static int mk_write_id( mk_context *c, unsigned id )
 {
-    unsigned char c_id[4] = { id >> 24, id >> 16, id >> 8, id };
+    uint8_t c_id[4] = { id >> 24, id >> 16, id >> 8, id };
 
     if( c_id[0] )
         return mk_append_context_data( c, c_id, 4 );
@@ -133,7 +133,7 @@ static int mk_write_id( mk_context *c, unsigned id )
 
 static int mk_write_size( mk_context *c, unsigned size )
 {
-    unsigned char c_size[5] = { 0x08, size >> 24, size >> 16, size >> 8, size };
+    uint8_t c_size[5] = { 0x08, size >> 24, size >> 16, size >> 8, size };
 
     if( size < 0x7f )
     {
@@ -160,7 +160,7 @@ static int mk_write_size( mk_context *c, unsigned size )
 
 static int mk_flush_context_id( mk_context *c )
 {
-    unsigned char ff = 0xff;
+    uint8_t ff = 0xff;
 
     if( !c->id )
         return 0;
@@ -249,9 +249,9 @@ static int mk_write_bin( mk_context *c, unsigned id, const void *data, unsigned
     return 0;
 }
 
-static int mk_write_uint( mk_context *c, unsigned id, int64_t ui )
+static int mk_write_uint( mk_context *c, unsigned id, uint64_t ui )
 {
-    unsigned char c_ui[8] = { ui >> 56, ui >> 48, ui >> 40, ui >> 32, ui >> 24, ui >> 16, ui >> 8, ui };
+    uint8_t c_ui[8] = { ui >> 56, ui >> 48, ui >> 40, ui >> 32, ui >> 24, ui >> 16, ui >> 8, ui };
     unsigned i = 0;
 
     CHECK( mk_write_id( c, id ) );
@@ -267,9 +267,9 @@ static int mk_write_float_raw( mk_context *c, float f )
     union
     {
         float f;
-        unsigned u;
+        uint32_t u;
     } u;
-    unsigned char c_f[4];
+    uint8_t c_f[4];
 
     u.f = f;
     c_f[0] = u.u >> 24;
@@ -408,7 +408,7 @@ static int mk_flush_frame( mk_writer *w )
 {
     int64_t delta;
     unsigned fsize;
-    unsigned char c_delta_flags[3];
+    uint8_t c_delta_flags[3];
 
     if( !w->in_frame )
         return 0;
@@ -435,8 +435,8 @@ static int mk_flush_frame( mk_writer *w )
     CHECK( mk_write_size( w->cluster, fsize + 4 ) ); // Size
     CHECK( mk_write_size( w->cluster, 1 ) );         // TrackNumber
 
-    c_delta_flags[0] = delta >> 8;
-    c_delta_flags[1] = delta;
+    c_delta_flags[0] = (uint8_t)(delta >> 8);
+    c_delta_flags[1] = (uint8_t)delta;
     c_delta_flags[2] = (w->keyframe << 7) | w->skippable;
     CHECK( mk_append_context_data( w->cluster, c_delta_flags, 3 ) ); // Timecode, Flags
     if( w->frame )
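
NOTE: mk_write_size writes EBML's variable-length size field: the position of
the first set bit in the leading byte tells the reader how many bytes the
size occupies, which is why the code special-cases size < 0x7f (an encoded
0xFF would alias the reserved "unknown size" pattern). Hedged helpers for the
two shortest encodings (illustrative names, not the patch's code):

    #include <stdint.h>

    static unsigned ebml_size_1byte( uint8_t *dst, unsigned size ) /* size <= 0x7e */
    {
        dst[0] = (uint8_t)(0x80 | size);          /* 1xxxxxxx: value in 7 bits */
        return 1;
    }
    static unsigned ebml_size_2byte( uint8_t *dst, unsigned size ) /* size <= 0x3ffe */
    {
        dst[0] = (uint8_t)(0x40 | (size >> 8));   /* 01xxxxxx: one byte follows */
        dst[1] = (uint8_t)size;
        return 2;
    }
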
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 1e84ecb72..5bc920076 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -262,7 +262,7 @@ intptr_t (*x264_checkasm_call)( intptr_t (*func)(), int *ok, ... ) = x264_checka
  * assembly function through x264_checkasm_call with added dummy arguments which forces all
  * real arguments to be passed on the stack and not in registers. For 32-bit argument the
  * upper half of the 64-bit register location on the stack will now contain junk. Note that
- * this is dependant on compiler behaviour and that interrupts etc. at the wrong time may
+ * this is dependent on compiler behaviour and that interrupts etc. at the wrong time may
 * overwrite the junk written to the stack so there's no guarantee that it will always
 * detect all functions that assumes zero-extension. */
 
@@ -2958,7 +2958,7 @@ REALIGN_STACK int main( int argc, char **argv )
         argv++;
     }
 
-    int seed = ( argc > 1 ) ? atoi(argv[1]) : x264_mdate();
+    unsigned seed = ( argc > 1 ) ? strtoul(argv[1], NULL, 0) : x264_mdate();
     fprintf( stderr, "x264: using random seed %u\n", seed );
     srand( seed );
 
diff --git a/tools/digress/cli.py b/tools/digress/cli.py
index 44158a49b..ecee71af4 100644
--- a/tools/digress/cli.py
+++ b/tools/digress/cli.py
@@ -139,7 +139,7 @@ def dispatch(self):
         args = self.optparse.parse_args()[1] # arguments may require reparsing after pre_dispatch; see test_x264.py
 
         if len(args) == 0:
-            print >>sys.stderr, "error: no comamnd specified\n"
+            print >>sys.stderr, "error: no command specified\n"
             self.optparse.print_help()
             return
 
diff --git a/x264.c b/x264.c
index 747bf3c99..8eb40840d 100644
--- a/x264.c
+++ b/x264.c
@@ -1302,7 +1302,7 @@ static int init_vid_filters( char *sequence, hnd_t *handle, video_info_t *info,
 {
     x264_register_vid_filters();
 
-    /* intialize baseline filters */
+    /* initialize baseline filters */
     if( x264_init_vid_filter( "source", handle, &filter, info, param, NULL ) ) /* wrap demuxer into a filter */
         return -1;
     if( x264_init_vid_filter( "resize", handle, &filter, info, param, "normcsp" ) ) /* normalize csps to be of a known/supported format */
diff --git a/x264.h b/x264.h
index d685efbd2..2c42623b1 100644
--- a/x264.h
+++ b/x264.h
@@ -521,7 +521,7 @@ typedef struct x264_param_t
     int         i_sps_id;               /* SPS and PPS id number */
     int         b_vfr_input;            /* VFR input.  If 1, use timebase and timestamps for ratecontrol purposes.
                                          * If 0, use fps only. */
-    int         b_pulldown;             /* use explicity set timebase for CFR */
+    int         b_pulldown;             /* use explicitly set timebase for CFR */
     uint32_t    i_fps_num;
     uint32_t    i_fps_den;
     uint32_t    i_timebase_num;         /* Timebase numerator */