[SCM] x264 packaging branch, ubuntu, updated. debian/0.85.1442.1+git781d30-1-2-g11c4c4e

Sat Feb 20 11:22:36 UTC 2010

The following commit has been merged in the ubuntu branch:
commit 4f815c28198ee157915dd4147b55563078ca59d1
Author: Reinhard Tartler <siretart at tauware.de>
Date:   Sat Feb 20 10:14:59 2010 +0100

    Imported Upstream version 0.85.1442.1+git781d30

diff --git a/common/arm/asm.S b/common/arm/asm.S
index d163165..395267f 100644
--- a/common/arm/asm.S
+++ b/common/arm/asm.S
@@ -20,6 +20,12 @@
 
 #include "config.h"
 
+#ifdef PREFIX
+#   define EXTERN_ASM _
+#else
+#   define EXTERN_ASM
+#endif
+
 #ifdef __ELF__
 #   define ELF
 #else
@@ -35,7 +41,8 @@ ELF     .eabi_attribute 25, \val
         .endm
 
         .macro function name
-        .global \name
+        .global EXTERN_ASM\name
+EXTERN_ASM\name:
 ELF     .hidden \name
 ELF     .type   \name, %function
         .func   \name
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index a62af39..e1db404 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -432,6 +432,311 @@ avg2_w20_loop:
 .endfunc
 
 
+.macro weight_prologue type
+    push        {r4-r5,lr}
+    ldr         r4,  [sp, #4*3]     // weight_t
+    ldr         ip,  [sp, #4*3+4]   // h
+.ifc \type, full
+    ldr         lr,  [r4, #32]      // denom
+.endif
+    ldrd        r4,  [r4, #32+4]    // scale, offset
+    vdup.16     q0,  r4
+    vdup.16     q1,  r5
+.ifc \type, full
+    rsb         lr,  lr,  #0
+    vdup.16     q2,  lr
+.endif
+.endm
+
+// void mc_weight( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride,
+//                 const x264_weight_t *weight, int height )
+function x264_mc_weight_w20_neon
+    weight_prologue full
+    sub         r1, #16
+weight20_loop:
+    subs        ip,  #2
+    vld1.8      {d17-d19}, [r2], r3
+    vmovl.u8    q10, d17
+    vmovl.u8    q11, d18
+    vmovl.u8    q14, d19
+    vld1.8      {d16-d18}, [r2], r3
+    vmovl.u8    q12, d16
+    vmovl.u8    q13, d17
+    vmovl.u8    q15, d18
+    vmul.s16    q10, q10, q0
+    vmul.s16    q11, q11, q0
+    vmul.s16    q12, q12, q0
+    vmul.s16    q13, q13, q0
+    vmul.s16    d28, d28, d0
+    vmul.s16    d29, d30, d0
+    vrshl.s16   q10, q10, q2
+    vrshl.s16   q11, q11, q2
+    vrshl.s16   q12, q12, q2
+    vrshl.s16   q13, q13, q2
+    vrshl.s16   q14, q14, q2
+    vadd.s16    q10, q10, q1
+    vadd.s16    q11, q11, q1
+    vadd.s16    q12, q12, q1
+    vadd.s16    q13, q13, q1
+    vadd.s16    q14, q14, q1
+    vqmovun.s16 d16, q10
+    vqmovun.s16 d17, q11
+    vqmovun.s16 d18, q12
+    vqmovun.s16 d19, q13
+    vqmovun.s16 d20, q14
+    vst1.8      {d16-d17}, [r0,:128]!
+    vst1.32     {d20[0]},  [r0,:32], r1
+    vst1.8      {d18-d19}, [r0,:128]!
+    vst1.32     {d20[1]},  [r0,:32], r1
+    bgt         weight20_loop
+    pop         {r4-r5,pc}
+.endfunc
+
+function x264_mc_weight_w16_neon
+    weight_prologue full
+weight16_loop:
+    subs        ip,  #2
+    vld1.8      {d16-d17}, [r2], r3
+    vld1.8      {d18-d19}, [r2], r3
+    vmovl.u8    q10, d16
+    vmovl.u8    q11, d17
+    vmovl.u8    q12, d18
+    vmovl.u8    q13, d19
+    vmul.s16    q10, q10, q0
+    vmul.s16    q11, q11, q0
+    vmul.s16    q12, q12, q0
+    vmul.s16    q13, q13, q0
+    vrshl.s16   q10, q10, q2
+    vrshl.s16   q11, q11, q2
+    vrshl.s16   q12, q12, q2
+    vrshl.s16   q13, q13, q2
+    vadd.s16    q10, q10, q1
+    vadd.s16    q11, q11, q1
+    vadd.s16    q12, q12, q1
+    vadd.s16    q13, q13, q1
+    vqmovun.s16 d16, q10
+    vqmovun.s16 d17, q11
+    vqmovun.s16 d18, q12
+    vqmovun.s16 d19, q13
+    vst1.8      {d16-d17}, [r0,:128], r1
+    vst1.8      {d18-d19}, [r0,:128], r1
+    bgt         weight16_loop
+    pop         {r4-r5,pc}
+.endfunc
+
+function x264_mc_weight_w8_neon
+    weight_prologue full
+weight8_loop:
+    subs        ip,  #2
+    vld1.8      {d16}, [r2], r3
+    vld1.8      {d18}, [r2], r3
+    vmovl.u8    q8,  d16
+    vmovl.u8    q9,  d18
+    vmul.s16    q8,  q8,  q0
+    vmul.s16    q9,  q9,  q0
+    vrshl.s16   q8,  q8,  q2
+    vrshl.s16   q9,  q9,  q2
+    vadd.s16    q8,  q8,  q1
+    vadd.s16    q9,  q9,  q1
+    vqmovun.s16 d16, q8
+    vqmovun.s16 d18, q9
+    vst1.8      {d16}, [r0,:64], r1
+    vst1.8      {d18}, [r0,:64], r1
+    bgt         weight8_loop
+    pop         {r4-r5,pc}
+.endfunc
+
+function x264_mc_weight_w4_neon
+    weight_prologue full
+weight4_loop:
+    subs        ip,  #2
+    vld1.32     {d16[]}, [r2], r3
+    vld1.32     {d18[]}, [r2], r3
+    vmovl.u8    q8,  d16
+    vmovl.u8    q9,  d18
+    vmul.s16    d16, d16, d0
+    vmul.s16    d17, d18, d0
+    vrshl.s16   q8,  q8,  q2
+    vadd.s16    q8,  q8,  q1
+    vqmovun.s16 d16, q8
+    vst1.32     {d16[0]}, [r0,:32], r1
+    vst1.32     {d16[1]}, [r0,:32], r1
+    bgt         weight4_loop
+    pop         {r4-r5,pc}
+.endfunc
+
+function x264_mc_weight_w20_nodenom_neon
+    weight_prologue nodenom
+    sub         r1, #16
+weight20_nodenom_loop:
+    subs        ip,  #2
+    vld1.8      {d17-d19}, [r2], r3
+    vmovl.u8    q10, d17
+    vmovl.u8    q11, d18
+    vmovl.u8    q14, d19
+    vld1.8      {d16-d18}, [r2], r3
+    vmovl.u8    q12, d16
+    vmovl.u8    q13, d17
+    vmovl.u8    q15, d18
+    vmov        q8,  q1
+    vmov        q9,  q1
+    vmla.s16    q8,  q10, q0
+    vmla.s16    q9,  q11, q0
+    vmov        q10, q1
+    vmov        q11, q1
+    vmla.s16    q10, q12, q0
+    vmla.s16    q11, q13, q0
+    vmov        q12, q1
+    vmla.s16    d24, d28, d0
+    vmla.s16    d25, d30, d0
+    vqmovun.s16 d16, q8
+    vqmovun.s16 d17, q9
+    vqmovun.s16 d18, q10
+    vqmovun.s16 d19, q11
+    vqmovun.s16 d20, q12
+    vst1.8      {d16-d17}, [r0,:128]!
+    vst1.32     {d20[0]},  [r0,:32], r1
+    vst1.8      {d18-d19}, [r0,:128]!
+    vst1.32     {d20[1]},  [r0,:32], r1
+    bgt         weight20_nodenom_loop
+    pop         {r4-r5,pc}
+.endfunc
+
+function x264_mc_weight_w16_nodenom_neon
+    weight_prologue nodenom
+weight16_nodenom_loop:
+    subs        ip,  #2
+    vld1.8      {d16-d17}, [r2], r3
+    vld1.8      {d18-d19}, [r2], r3
+    vmovl.u8    q12, d16
+    vmovl.u8    q13, d17
+    vmovl.u8    q14, d18
+    vmovl.u8    q15, d19
+    vmov        q8,  q1
+    vmov        q9,  q1
+    vmov        q10, q1
+    vmov        q11, q1
+    vmla.s16    q8,  q12, q0
+    vmla.s16    q9,  q13, q0
+    vmla.s16    q10, q14, q0
+    vmla.s16    q11, q15, q0
+    vqmovun.s16 d16, q8
+    vqmovun.s16 d17, q9
+    vqmovun.s16 d18, q10
+    vqmovun.s16 d19, q11
+    vst1.8      {d16-d17}, [r0,:128], r1
+    vst1.8      {d18-d19}, [r0,:128], r1
+    bgt         weight16_nodenom_loop
+    pop         {r4-r5,pc}
+.endfunc
+
+function x264_mc_weight_w8_nodenom_neon
+    weight_prologue nodenom
+weight8_nodenom_loop:
+    subs        ip,  #2
+    vld1.8      {d16}, [r2], r3
+    vld1.8      {d18}, [r2], r3
+    vmovl.u8    q8,  d16
+    vmovl.u8    q9,  d18
+    vmov        q10, q1
+    vmov        q11, q1
+    vmla.s16    q10, q8,  q0
+    vmla.s16    q11, q9,  q0
+    vqmovun.s16 d16, q10
+    vqmovun.s16 d17, q11
+    vst1.8      {d16}, [r0,:64], r1
+    vst1.8      {d17}, [r0,:64], r1
+    bgt         weight8_nodenom_loop
+    pop         {r4-r5,pc}
+.endfunc
+
+function x264_mc_weight_w4_nodenom_neon
+    weight_prologue nodenom
+weight4_nodenom_loop:
+    subs        ip,  #2
+    vld1.32     {d16[]}, [r2], r3
+    vld1.32     {d18[]}, [r2], r3
+    vmovl.u8    q8,  d16
+    vmovl.u8    q9,  d18
+    vmov        q10, q1
+    vmla.s16    d20, d16, d0
+    vmla.s16    d21, d18, d0
+    vqmovun.s16 d16, q10
+    vst1.32     {d16[0]}, [r0,:32], r1
+    vst1.32     {d16[1]}, [r0,:32], r1
+    bgt         weight4_nodenom_loop
+    pop         {r4-r5,pc}
+.endfunc
+
+.macro weight_simple_prologue
+    push        {lr}
+    ldr         lr,  [sp, #4]       // weight_t
+    ldr         ip,  [sp, #8]       // h
+    ldr         lr,  [lr]           // offset
+    vdup.8      q1,  lr
+.endm
+
+.macro weight_simple name op
+function x264_mc_weight_w20_\name\()_neon
+    weight_simple_prologue
+weight20_\name\()_loop:
+    subs        ip,  #2
+    vld1.8      {d16-d18}, [r2], r3
+    vld1.8      {d19-d21}, [r2], r3
+    \op         q8,  q8,  q1
+    \op         q9,  q9,  q1
+    \op         q10, q10, q1
+    vst1.8      {d16-d18}, [r0,:64], r1
+    vst1.8      {d19-d21}, [r0,:64], r1
+    bgt         weight20_\name\()_loop
+    pop         {pc}
+.endfunc
+
+function x264_mc_weight_w16_\name\()_neon
+    weight_simple_prologue
+weight16_\name\()_loop:
+    subs        ip,  #2
+    vld1.8      {d16-d17}, [r2], r3
+    vld1.8      {d18-d19}, [r2], r3
+    \op         q8,  q8,  q1
+    \op         q9,  q9,  q1
+    vst1.8      {d16-d17}, [r0,:128], r1
+    vst1.8      {d18-d19}, [r0,:128], r1
+    bgt         weight16_\name\()_loop
+    pop         {pc}
+.endfunc
+
+function x264_mc_weight_w8_\name\()_neon
+    weight_simple_prologue
+weight8_\name\()_loop:
+    subs        ip,  #2
+    vld1.8      {d16}, [r2], r3
+    vld1.8      {d17}, [r2], r3
+    \op         q8,  q8,  q1
+    vst1.8      {d16}, [r0,:64], r1
+    vst1.8      {d17}, [r0,:64], r1
+    bgt         weight8_\name\()_loop
+    pop         {pc}
+.endfunc
+
+function x264_mc_weight_w4_\name\()_neon
+    weight_simple_prologue
+weight4_\name\()_loop:
+    subs        ip,  #2
+    vld1.32     {d16[]}, [r2], r3
+    vld1.32     {d17[]}, [r2], r3
+    \op         q8,  q8,  q1
+    vst1.32     {d16[0]}, [r0,:32], r1
+    vst1.32     {d17[0]}, [r0,:32], r1
+    bgt         weight4_\name\()_loop
+    pop         {pc}
+.endfunc
+.endm
+
+weight_simple offsetadd, vqadd.u8
+weight_simple offsetsub, vqsub.u8
+
+
 // void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height )
 function x264_mc_copy_w4_neon
     ldr         ip,  [sp]
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index 20cf151..0a7b734 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -43,6 +43,48 @@ void x264_pixel_avg2_w8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
 void x264_pixel_avg2_w16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
 void x264_pixel_avg2_w20_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
 
+#define MC_WEIGHT(func)\
+void x264_mc_weight_w20##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
+void x264_mc_weight_w16##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
+void x264_mc_weight_w8##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
+void x264_mc_weight_w4##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
+\
+static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int ) =\
+{\
+    x264_mc_weight_w4##func##_neon,\
+    x264_mc_weight_w4##func##_neon,\
+    x264_mc_weight_w8##func##_neon,\
+    x264_mc_weight_w16##func##_neon,\
+    x264_mc_weight_w16##func##_neon,\
+    x264_mc_weight_w20##func##_neon,\
+};
+
+MC_WEIGHT()
+MC_WEIGHT(_nodenom)
+MC_WEIGHT(_offsetadd)
+MC_WEIGHT(_offsetsub)
+
+static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
+{
+    if( w->i_scale == 1<<w->i_denom )
+    {
+        if( w->i_offset < 0 )
+        {
+            w->weightfn = x264_mc_offsetsub_wtab_neon;
+            w->cachea[0] = -w->i_offset;
+        }
+        else
+        {
+            w->weightfn = x264_mc_offsetadd_wtab_neon;
+            w->cachea[0] = w->i_offset;
+        }
+    }
+    else if( !w->i_denom )
+        w->weightfn = x264_mc_nodenom_wtab_neon;
+    else
+        w->weightfn = x264_mc_wtab_neon;
+}
+
 void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
 void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
 void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
@@ -182,6 +224,11 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
     pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
     pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;
 
+    pf->weight    = x264_mc_wtab_neon;
+    pf->offsetadd = x264_mc_offsetadd_wtab_neon;
+    pf->offsetsub = x264_mc_offsetsub_wtab_neon;
+    pf->weight_cache = x264_weight_cache_neon;
+
 // Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
 #ifndef SYS_MACOSX
     pf->memcpy_aligned  = x264_memcpy_aligned_neon;
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index 4dd65ed..d8533e5 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -110,16 +110,17 @@ SAD4_ARMV6 8
 
 .macro SAD_FUNC w, h, name, align:vararg
 function x264_pixel_sad\name\()_\w\()x\h\()_neon
+    SAD_START_\w \align
+
 .if \w == 16
-    .set r, \h / 2 - 1
+.rept \h / 2 - 1
+    SAD_\w \align
+.endr
 .else
-    .set r, \h - 1
-.endif
-
-    SAD_START_\w \align
-.rept r
+.rept \h - 1
     SAD_\w \align
 .endr
+.endif
 
 .if \w > 8
     vabal.u8    q8,  d4,  d6
diff --git a/common/common.c b/common/common.c
index 6d1d7f0..0dd7af5 100644
--- a/common/common.c
+++ b/common/common.c
@@ -515,7 +515,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
     OPT("psy-rd")
     {
         if( 2 == sscanf( value, "%f:%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
-            2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) )
+            2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
+            2 == sscanf( value, "%f|%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ))
         { }
         else if( sscanf( value, "%f", &p->analyse.f_psy_rd ) )
         {
@@ -886,7 +887,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
     s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
     s += sprintf( s, " psy=%d", p->analyse.b_psy );
     if( p->analyse.b_psy )
-        s += sprintf( s, " psy_rd=%.1f:%.1f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
+        s += sprintf( s, " psy_rd=%.2f:%.2f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
     s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
     s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
     s += sprintf( s, " chroma_me=%d", p->analyse.b_chroma_me );
diff --git a/common/common.h b/common/common.h
index 950f48f..e2e8fac 100644
--- a/common/common.h
+++ b/common/common.h
@@ -484,6 +484,7 @@ struct x264_t
         int     b_chroma_me;
         int     b_trellis;
         int     b_noise_reduction;
+        int     b_dct_decimate;
         int     i_psy_rd; /* Psy RD strength--fixed point value*/
         int     i_psy_trellis; /* Psy trellis strength--fixed point value*/
 
@@ -653,11 +654,12 @@ struct x264_t
         int     i_chroma_lambda2_offset;
 
         /* B_direct and weighted prediction */
-        int16_t dist_scale_factor[16][2];
+        int16_t dist_scale_factor_buf[2][16][2];
+        int16_t (*dist_scale_factor)[2];
         int8_t bipred_weight_buf[2][32][4];
         int8_t (*bipred_weight)[4];
         /* maps fref1[0]'s ref indices into the current list0 */
-#define map_col_to_list0(col) h->mb.map_col_to_list0[col+2]
+#define map_col_to_list0(col) h->mb.map_col_to_list0[(col)+2]
         int8_t  map_col_to_list0[18];
         int ref_blind_dupe; /* The index of the blind reference frame duplicate. */
     } mb;
diff --git a/common/frame.c b/common/frame.c
index 40cc78f..d89f5ab 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -472,12 +472,14 @@ static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int a
                 int delta;
                 if( abs( p2 - p0 ) < beta )
                 {
-                    pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
+                    if( tc0[i] )
+                        pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
                     tc++;
                 }
                 if( abs( q2 - q0 ) < beta )
                 {
-                    pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
+                    if( tc0[i] )
+                        pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
                     tc++;
                 }
 
diff --git a/common/frame.h b/common/frame.h
index b1852b3..7c8e2ff 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -48,6 +48,7 @@ typedef struct x264_frame
     uint8_t i_bframes;   /* number of bframes following this nonb in coded order */
     float   f_qp_avg_rc; /* QPs as decided by ratecontrol */
     float   f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */
+    int     i_poc_l0ref0; /* poc of first refframe in L0, used to check if direct temporal is possible */
 
     /* YUV buffer */
     int     i_plane;
diff --git a/common/macroblock.c b/common/macroblock.c
index 10f09ac..278659c 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -190,7 +190,8 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
         const int x8 = i8%2;
         const int y8 = i8/2;
         const int i_part_8x8 = i_mb_8x8 + x8 + y8 * h->mb.i_b8_stride;
-        const int i_ref = map_col_to_list0(h->fref1[0]->ref[0][i_part_8x8]);
+        const int i_ref1_ref = h->fref1[0]->ref[0][i_part_8x8];
+        const int i_ref = (map_col_to_list0(i_ref1_ref>>h->sh.b_mbaff) << h->sh.b_mbaff) + (i_ref1_ref&h->sh.b_mbaff);
 
         if( i_ref >= 0 )
         {
@@ -271,6 +272,9 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, mv[0] );
     x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, mv[1] );
 
+    if( !M64( mv ) )
+        return 1;
+
     if( h->param.i_threads > 1
         && ( mv[0][1] > h->mb.mv_max_spel[1]
           || mv[1][1] > h->mb.mv_max_spel[1] ) )
@@ -1238,6 +1242,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
         if( h->sh.i_type == SLICE_TYPE_B )
         {
             h->mb.bipred_weight = h->mb.bipred_weight_buf[h->mb.b_interlaced&(i_mb_y&1)];
+            h->mb.dist_scale_factor = h->mb.dist_scale_factor_buf[h->mb.b_interlaced&(i_mb_y&1)];
             if( h->param.b_cabac )
             {
                 uint8_t skipbp;
@@ -1477,9 +1482,7 @@ void x264_macroblock_bipred_init( x264_t *h )
                     dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
                 }
 
-                // FIXME: will need this if we ever do temporal MV pred with interlaced
-                if( !h->sh.b_mbaff )
-                    h->mb.dist_scale_factor[i_ref0][i_ref1] = dist_scale_factor;
+                h->mb.dist_scale_factor_buf[field][i_ref0][i_ref1] = dist_scale_factor;
 
                 dist_scale_factor >>= 2;
                 if( h->param.analyse.b_weighted_bipred
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index 29e05f1..62e281a 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -59,7 +59,7 @@ endstruc
 %macro LOAD_GLOBAL 4
 %ifdef PIC
     ; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
-    lea   r11, [%2 GLOBAL]
+    lea   r11, [%2]
     %ifnidn %3, 0
     add   r11, %3
     %endif
diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm
index a713dd6..3350e40 100644
--- a/common/x86/dct-32.asm
+++ b/common/x86/dct-32.asm
@@ -349,7 +349,7 @@ cglobal x264_sub8x8_dct_%1, 3,3
 global x264_sub8x8_dct_%1.skip_prologue
 .skip_prologue:
 %ifnidn %1, sse2
-    mova m7, [hsub_mul GLOBAL]
+    mova m7, [hsub_mul]
 %endif
     LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE
     SPILL r0, 1,2
@@ -393,7 +393,7 @@ global x264_sub8x8_dct8_%1.skip_prologue
     LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
     UNSPILL r0, 0
 %else
-    mova m7, [hsub_mul GLOBAL]
+    mova m7, [hsub_mul]
     LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
     SPILL r0, 0,1
     SWAP 1, 7
@@ -441,9 +441,9 @@ global x264_add8x8_idct_sse2.skip_prologue
     SPILL r1, 0
     TRANSPOSE2x4x4W 4,5,6,7,0
     UNSPILL r1, 0
-    paddw m0, [pw_32 GLOBAL]
+    paddw m0, [pw_32]
     IDCT4_1D 0,1,2,3,r1
-    paddw m4, [pw_32 GLOBAL]
+    paddw m4, [pw_32]
     IDCT4_1D 4,5,6,7,r1
     SPILL r1, 6,7
     pxor m7, m7
@@ -466,7 +466,7 @@ global x264_add8x8_idct8_sse2.skip_prologue
     IDCT8_1D   0,1,2,3,4,5,6,7,r1
     SPILL r1, 6
     TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1
-    paddw      m0, [pw_32 GLOBAL]
+    paddw      m0, [pw_32]
     SPILL r1, 0
     IDCT8_1D   0,1,2,3,4,5,6,7,r1
     SPILL r1, 6,7
diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm
index 9915789..ba7741e 100644
--- a/common/x86/dct-64.asm
+++ b/common/x86/dct-64.asm
@@ -143,7 +143,7 @@ INIT_XMM
 cglobal x264_sub8x8_dct_%1, 3,3,11
     add r2, 4*FDEC_STRIDE
 %ifnidn %1, sse2
-    mova m7, [hsub_mul GLOBAL]
+    mova m7, [hsub_mul]
 %endif
 %ifdef WIN64
     call .skip_prologue
@@ -170,7 +170,7 @@ global x264_sub8x8_dct_%1.skip_prologue
 cglobal x264_sub8x8_dct8_%1, 3,3,11
     add r2, 4*FDEC_STRIDE
 %ifnidn %1, sse2
-    mova m7, [hsub_mul GLOBAL]
+    mova m7, [hsub_mul]
 %endif
 %ifdef WIN64
     call .skip_prologue
@@ -227,7 +227,7 @@ global x264_add8x8_idct8_sse2.skip_prologue
     movdqa  m7, [r1+0x70]
     IDCT8_1D      0,1,2,3,4,5,6,7,8,10
     TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
-    paddw         m0, [pw_32 GLOBAL] ; rounding for the >>6 at the end
+    paddw         m0, [pw_32] ; rounding for the >>6 at the end
     IDCT8_1D      0,1,2,3,4,5,6,7,8,10
     DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
     DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
@@ -265,9 +265,9 @@ global x264_add8x8_idct_sse2.skip_prologue
     TRANSPOSE2x4x4W 0,1,2,3,8
     IDCT4_1D 4,5,6,7,8,10
     TRANSPOSE2x4x4W 4,5,6,7,8
-    paddw m0, [pw_32 GLOBAL]
+    paddw m0, [pw_32]
     IDCT4_1D 0,1,2,3,8,10
-    paddw m4, [pw_32 GLOBAL]
+    paddw m4, [pw_32]
     IDCT4_1D 4,5,6,7,8,10
     DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
     DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index d4a0cae..618433c 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -80,7 +80,7 @@ cglobal x264_dct4x4dc_mmx, 1,1
     movq   m2, [r0+16]
     movq   m1, [r0+ 8]
     movq   m0, [r0+ 0]
-    movq   m7, [pw_8000 GLOBAL] ; convert to unsigned and back, so that pavgw works
+    movq   m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
     WALSH4_1D  0,1,2,3,4
     TRANSPOSE4x4W 0,1,2,3,4
     SUMSUB_BADC m1, m0, m3, m2, m4
@@ -123,7 +123,7 @@ cglobal x264_sub4x4_dct_%1, 3,3
     LOAD_DIFF  m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
     LOAD_DIFF  m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
 %else
-    mova m5, [hsub_mul GLOBAL]
+    mova m5, [hsub_mul]
     LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2
 %endif
     DCT4_1D 0,1,2,3,4
@@ -151,7 +151,7 @@ cglobal x264_add4x4_idct_mmx, 2,2
     movq  m0, [r1+ 0]
     IDCT4_1D 0,1,2,3,4,5
     TRANSPOSE4x4W 0,1,2,3,4
-    paddw m0, [pw_32 GLOBAL]
+    paddw m0, [pw_32]
     IDCT4_1D 0,1,2,3,4,5
     STORE_DIFF  m0, m4, m7, [r0+0*FDEC_STRIDE]
     STORE_DIFF  m1, m4, m7, [r0+1*FDEC_STRIDE]
@@ -179,7 +179,7 @@ cglobal x264_add4x4_idct_sse4, 2,2,6
     punpckhdq m2, m0
     SWAP 0, 1
 
-    mova      m1, [pw_32_0 GLOBAL]
+    mova      m1, [pw_32_0]
     paddw     m1, m0            ; row1/row0 corrected
     psraw     m0, 1             ; row1>>1/...
     mova      m3, m2            ; row3/row2
@@ -221,7 +221,7 @@ cglobal %1, 3,3,11
     pxor m7, m7
 %else
     add r2, 4*FDEC_STRIDE
-    mova m7, [hsub_mul GLOBAL]
+    mova m7, [hsub_mul]
 %endif
 .skip_prologue:
 %ifdef WIN64
@@ -335,7 +335,7 @@ cglobal x264_add8x8_idct_dc_mmx, 2,2
     movq      mm0, [r1]
     pxor      mm1, mm1
     add        r0, FDEC_STRIDE*4
-    paddw     mm0, [pw_32 GLOBAL]
+    paddw     mm0, [pw_32]
     psraw     mm0, 6
     psubw     mm1, mm0
     packuswb  mm0, mm0
@@ -354,10 +354,10 @@ cglobal x264_add8x8_idct_dc_ssse3, 2,2
     movq      xmm0, [r1]
     pxor      xmm1, xmm1
     add         r0, FDEC_STRIDE*4
-    paddw     xmm0, [pw_32 GLOBAL]
+    paddw     xmm0, [pw_32]
     psraw     xmm0, 6
     psubw     xmm1, xmm0
-    movdqa    xmm5, [pb_idctdc_unpack GLOBAL]
+    movdqa    xmm5, [pb_idctdc_unpack]
     packuswb  xmm0, xmm0
     packuswb  xmm1, xmm1
     pshufb    xmm0, xmm5
@@ -393,7 +393,7 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3
 .loop:
     movq      mm0, [r1]
     pxor      mm1, mm1
-    paddw     mm0, [pw_32 GLOBAL]
+    paddw     mm0, [pw_32]
     psraw     mm0, 6
     psubw     mm1, mm0
     packuswb  mm0, mm0
@@ -447,8 +447,8 @@ cglobal x264_add16x16_idct_dc_sse2, 2,2,8
     punpcklwd xmm2, xmm2
     pxor      xmm1, xmm1
     pxor      xmm3, xmm3
-    paddw     xmm0, [pw_32 GLOBAL]
-    paddw     xmm2, [pw_32 GLOBAL]
+    paddw     xmm0, [pw_32]
+    paddw     xmm2, [pw_32]
     psraw     xmm0, 6
     psraw     xmm2, 6
     psubw     xmm1, xmm0
@@ -477,11 +477,11 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
     movdqa    xmm0, [r1]
     add       r1, 16
     pxor      xmm1, xmm1
-    paddw     xmm0, [pw_32 GLOBAL]
+    paddw     xmm0, [pw_32]
     psraw     xmm0, 6
     psubw     xmm1, xmm0
-    movdqa    xmm5, [ pb_idctdc_unpack GLOBAL]
-    movdqa    xmm6, [pb_idctdc_unpack2 GLOBAL]
+    movdqa    xmm5, [ pb_idctdc_unpack]
+    movdqa    xmm6, [pb_idctdc_unpack2]
     packuswb  xmm0, xmm0
     packuswb  xmm1, xmm1
     movdqa    xmm2, xmm0
@@ -815,8 +815,8 @@ cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
 cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
     movdqa    xmm1, [r1+16]
     movdqa    xmm0, [r1]
-    pshufb    xmm1, [pb_scan4frameb GLOBAL]
-    pshufb    xmm0, [pb_scan4framea GLOBAL]
+    pshufb    xmm1, [pb_scan4frameb]
+    pshufb    xmm0, [pb_scan4framea]
     movdqa    xmm2, xmm1
     psrldq    xmm1, 6
     palignr   xmm2, xmm0, 6
@@ -963,9 +963,9 @@ cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
     punpcklqdq xmm0, xmm2
     punpcklqdq xmm4, xmm6
 %ifidn %2, frame
-    movdqa    xmm7, [pb_sub4frame GLOBAL]
+    movdqa    xmm7, [pb_sub4frame]
 %else
-    movdqa    xmm7, [pb_sub4field GLOBAL]
+    movdqa    xmm7, [pb_sub4field]
 %endif
     pshufb    xmm0, xmm7
     pshufb    xmm4, xmm7
@@ -980,7 +980,7 @@ cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
     psubw     xmm1, xmm5
 %ifidn %1, ac
     movd       r2d, xmm0
-    pand      xmm0, [pb_subacmask GLOBAL]
+    pand      xmm0, [pb_subacmask]
 %endif
     movdqa    [r0], xmm0
     pxor      xmm2, xmm2
@@ -1039,7 +1039,7 @@ cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
     packsswb m5, m5
     pxor     m0, m0
     pcmpeqb  m5, m0
-    paddb    m5, [pb_1 GLOBAL]
+    paddb    m5, [pb_1]
     movd    r0d, m5
     mov  [r2+0], r0w
     shr     r0d, 16
@@ -1085,7 +1085,7 @@ cglobal x264_zigzag_interleave_8x8_cavlc_sse2, 3,3,8
     packsswb m2, m2
     packsswb m2, m2
     pcmpeqb  m5, m2
-    paddb    m5, [pb_1 GLOBAL]
+    paddb    m5, [pb_1]
     movd    r0d, m5
     mov  [r2+0], r0w
     shr     r0d, 16
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index 75b308f..00d0418 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -233,19 +233,19 @@ SECTION .text
 ; clobbers: m0,3-6
 %macro DEBLOCK_P0_Q0 0
     mova    m5, m1
-    pxor    m5, m2           ; p0^q0
-    pand    m5, [pb_01 GLOBAL] ; (p0^q0)&1
+    pxor    m5, m2       ; p0^q0
+    pand    m5, [pb_01]  ; (p0^q0)&1
     pcmpeqb m4, m4
     pxor    m3, m4
-    pavgb   m3, m0           ; (p1 - q1 + 256)>>1
-    pavgb   m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
+    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
+    pavgb   m3, [pb_03]  ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
     pxor    m4, m1
-    pavgb   m4, m2           ; (q0 - p0 + 256)>>1
+    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
     pavgb   m3, m5
-    paddusb m3, m4           ; d+128+33
-    mova    m6, [pb_a1 GLOBAL]
+    paddusb m3, m4       ; d+128+33
+    mova    m6, [pb_a1]
     psubusb m6, m3
-    psubusb m3, [pb_a1 GLOBAL]
+    psubusb m3, [pb_a1]
     pminub  m6, m7
     pminub  m3, m7
     psubusb m1, m6
@@ -261,10 +261,10 @@ SECTION .text
 %macro LUMA_Q1 6
     mova    %6, m1
     pavgb   %6, m2
-    pavgb   %2, %6             ; avg(p2,avg(p0,q0))
+    pavgb   %2, %6       ; avg(p2,avg(p0,q0))
     pxor    %6, %3
-    pand    %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
-    psubusb %2, %6             ; (p2+((p0+q0+1)>>1))>>1
+    pand    %6, [pb_01]  ; (p2^avg(p0,q0))&1
+    psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
     mova    %6, %1
     psubusb %6, %5
     paddusb %5, %1
@@ -614,8 +614,8 @@ DEBLOCK_LUMA sse2, v, 16
     %define mask0 spill(2)
     %define mask1p spill(3)
     %define mask1q spill(4)
-    %define mpb_00 [pb_00 GLOBAL]
-    %define mpb_01 [pb_01 GLOBAL]
+    %define mpb_00 [pb_00]
+    %define mpb_01 [pb_01]
 %endif
 
 ;-----------------------------------------------------------------------------
@@ -639,7 +639,7 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
     mova    q1, [r0+r1]
 %ifdef ARCH_X86_64
     pxor    mpb_00, mpb_00
-    mova    mpb_01, [pb_01 GLOBAL]
+    mova    mpb_01, [pb_01]
     LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
     SWAP    7, 12 ; m12=mask0
     pavgb   t5, mpb_00
@@ -658,8 +658,8 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
     LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
     mova    m4, t5
     mova    mask0, m7
-    pavgb   m4, [pb_00 GLOBAL]
-    pavgb   m4, [pb_01 GLOBAL] ; alpha/4+1
+    pavgb   m4, [pb_00]
+    pavgb   m4, [pb_01] ; alpha/4+1
     DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
     pand    m6, mask0
     DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
@@ -835,7 +835,7 @@ chroma_inter_body_mmxext:
 %macro CHROMA_INTRA_P0 3
     movq    m4, %1
     pxor    m4, %3
-    pand    m4, [pb_01 GLOBAL] ; m4 = (p0^q1)&1
+    pand    m4, [pb_01] ; m4 = (p0^q1)&1
     pavgb   %1, %3
     psubusb %1, m4
     pavgb   %1, %2             ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index f486a8d..9783066 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -89,9 +89,9 @@ SECTION .text
 %macro BIWEIGHT_START_MMX 0
     movd    m2, r6m
     SPLATW  m2, m2   ; weight_dst
-    mova    m3, [pw_64 GLOBAL]
+    mova    m3, [pw_64]
     psubw   m3, m2   ; weight_src
-    mova    m4, [pw_32 GLOBAL] ; rounding
+    mova    m4, [pw_32] ; rounding
     pxor    m5, m5
 %endmacro
 
@@ -111,7 +111,7 @@ SECTION .text
     shl    t7d, 8
     add    t6d, t7d
     movd    m3, t6d
-    mova    m4, [pw_32 GLOBAL]
+    mova    m4, [pw_32]
     SPLATW  m3, m3   ; weight_dst,src
 %endmacro
 
@@ -641,7 +641,7 @@ AVG2_W20 sse2_misalign
 %macro INIT_SHIFT 2
     and    eax, 7
     shl    eax, 3
-    movd   %1, [sw_64 GLOBAL]
+    movd   %1, [sw_64]
     movd   %2, eax
     psubw  %1, %2
 %endmacro
@@ -778,10 +778,10 @@ cglobal x264_pixel_avg2_w16_cache64_ssse3
     shl    r6, 4         ;jump = (offset + align*2)*48
 %define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
 %ifdef PIC
-    lea   r11, [avg_w16_addr GLOBAL]
+    lea   r11, [avg_w16_addr]
     add    r6, r11
 %else
-    lea    r6, [avg_w16_addr + r6 GLOBAL]
+    lea    r6, [avg_w16_addr + r6]
 %endif
 %ifdef UNIX64
     jmp    r6
@@ -1007,7 +1007,7 @@ cglobal x264_mc_chroma_%1
     SPLATW     m5, m5        ; m5 = dx
     SPLATW     m6, m6        ; m6 = dy
 
-    mova       m4, [pw_8 GLOBAL]
+    mova       m4, [pw_8]
     mova       m0, m4
     psubw      m4, m5        ; m4 = 8-dx
     psubw      m0, m6        ; m0 = 8-dy
@@ -1042,7 +1042,7 @@ cglobal x264_mc_chroma_%1
     punpcklbw  m2, m3
     punpcklbw  m1, m3
 
-    paddw      m0, [pw_32 GLOBAL]
+    paddw      m0, [pw_32]
 
     pmullw     m2, m5        ; line * cB
     pmullw     m1, m7        ; line * cD
@@ -1084,9 +1084,9 @@ cglobal x264_mc_chroma_%1
     movd       m6, r4d
     mov       r5d, 1
 .mc1d:
-    mova       m5, [pw_8 GLOBAL]
+    mova       m5, [pw_8]
     SPLATW     m6, m6
-    mova       m7, [pw_4 GLOBAL]
+    mova       m7, [pw_4]
     psubw      m5, m6
     movifnidn r0,  r0mp
     movifnidn r1d, r1m
@@ -1166,7 +1166,7 @@ cglobal x264_mc_chroma_ssse3%1, 0,6,%2
     imul      r4d, t0d ; (x*255+8)*(8-y)
     cmp dword r6m, 4
     jg .width8
-    mova       m5, [pw_32 GLOBAL]
+    mova       m5, [pw_32]
     movd       m6, r5d
     movd       m7, r4d
     movifnidn  r0, r0mp
@@ -1178,10 +1178,10 @@ cglobal x264_mc_chroma_ssse3%1, 0,6,%2
     and        r2, ~3
     and        r5, 3
 %ifdef PIC
-    lea       r11, [ch_shuffle GLOBAL]
+    lea       r11, [ch_shuffle]
     movu       m5, [r11 + r5*2]
 %else
-    movu       m5, [ch_shuffle + r5*2 GLOBAL]
+    movu       m5, [ch_shuffle + r5*2]
 %endif
     movu       m0, [r2]
     pshufb     m0, m5
@@ -1197,8 +1197,8 @@ cglobal x264_mc_chroma_ssse3%1, 0,6,%2
     pmaddubsw  m1, m6
     pmaddubsw  m2, m7
     pmaddubsw  m3, m6
-    paddw      m0, [pw_32 GLOBAL]
-    paddw      m2, [pw_32 GLOBAL]
+    paddw      m0, [pw_32]
+    paddw      m2, [pw_32]
     paddw      m1, m0
     paddw      m3, m2
     mova       m0, m4
@@ -1228,7 +1228,7 @@ INIT_XMM
     cmp        r5, 0x38
     jge .split
 %endif
-    mova       m5, [pw_32 GLOBAL]
+    mova       m5, [pw_32]
     movh       m0, [r2]
     movh       m1, [r2+1]
     punpcklbw  m0, m1
@@ -1265,18 +1265,18 @@ INIT_XMM
     and        r2, ~7
     and        r5, 7
 %ifdef PIC
-    lea       r11, [ch_shuffle GLOBAL]
+    lea       r11, [ch_shuffle]
     movu       m5, [r11 + r5*2]
 %else
-    movu       m5, [ch_shuffle + r5*2 GLOBAL]
+    movu       m5, [ch_shuffle + r5*2]
 %endif
     movu       m0, [r2]
     pshufb     m0, m5
 %ifdef ARCH_X86_64
-    mova       m8, [pw_32 GLOBAL]
+    mova       m8, [pw_32]
     %define round m8
 %else
-    %define round [pw_32 GLOBAL]
+    %define round [pw_32]
 %endif
 .splitloop8:
     movu       m1, [r2+r3]
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 245c09f..f2e69c0 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -125,7 +125,7 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2
 %ifnidn %1, ssse3
     pxor m0, m0
 %else
-    mova m0, [filt_mul51 GLOBAL]
+    mova m0, [filt_mul51]
 %endif
 .loop:
 %ifidn %1, ssse3
@@ -142,8 +142,8 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2
     pmaddubsw m4, m0
     pmaddubsw m2, m0
     pmaddubsw m5, m0
-    pmaddubsw m3, [filt_mul20 GLOBAL]
-    pmaddubsw m6, [filt_mul20 GLOBAL]
+    pmaddubsw m3, [filt_mul20]
+    pmaddubsw m6, [filt_mul20]
     paddw  m1, m2
     paddw  m4, m5
     paddw  m1, m3
@@ -155,7 +155,7 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2
     LOAD_ADD   m6,     [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
     FILT_V2
 %endif
-    mova      m7, [pw_16 GLOBAL]
+    mova      m7, [pw_16]
     mova      [r2+r4*2], m1
     mova      [r2+r4*2+mmsize], m4
     paddw     m1, m7
@@ -180,7 +180,7 @@ cglobal x264_hpel_filter_c_mmxext, 3,3
     lea r1, [r1+r2*2]
     neg r2
     %define src r1+r2*2
-    movq m7, [pw_32 GLOBAL]
+    movq m7, [pw_32]
 .loop:
     movq   m1, [src-4]
     movq   m2, [src-2]
@@ -237,7 +237,7 @@ cglobal x264_hpel_filter_h_mmxext, 3,3
     punpcklbw  m7, m0
     punpcklbw  m6, m0
     paddw      m6, m7 ; a1
-    movq       m7, [pw_1 GLOBAL]
+    movq       m7, [pw_1]
     FILT_H2 m1, m2, m3, m4, m5, m6
     FILT_PACK m1, m4, 1
     movntq     [r0+r2], m1
@@ -257,13 +257,13 @@ cglobal x264_hpel_filter_c_%1, 3,3,9
     neg r2
     %define src r1+r2*2
 %ifidn %1, ssse3
-    mova    m7, [pw_32 GLOBAL]
+    mova    m7, [pw_32]
     %define tpw_32 m7
 %elifdef ARCH_X86_64
-    mova    m8, [pw_32 GLOBAL]
+    mova    m8, [pw_32]
     %define tpw_32 m8
 %else
-    %define tpw_32 [pw_32 GLOBAL]
+    %define tpw_32 [pw_32]
 %endif
 .loop:
 %ifidn %1,sse2_misalign
@@ -340,7 +340,7 @@ cglobal x264_hpel_filter_h_sse2, 3,3,8
     punpcklbw  m6, m0
     punpcklbw  m7, m0
     paddw      m6, m7 ; c1
-    mova       m7, [pw_1 GLOBAL] ; FIXME xmm8
+    mova       m7, [pw_1] ; FIXME xmm8
     FILT_H2 m1, m2, m3, m4, m5, m6
     FILT_PACK m1, m4, 1
     movntdq    [r0+r2], m1
@@ -362,7 +362,7 @@ cglobal x264_hpel_filter_h_ssse3, 3,3
     punpcklbw m1, m0         ; 00 -1 00 -2 00 -3 00 -4 00 -5 00 -6 00 -7 00 -8
     movh m2, [src]
     punpcklbw m2, m0
-    mova       m7, [pw_1 GLOBAL]
+    mova       m7, [pw_1]
 .loop:
     movh       m3, [src+8]
     punpcklbw  m3, m0
@@ -436,7 +436,7 @@ HPEL_V ssse3
     mova m3, [r1]
     mova %4, [r1+r2]
     mova m0, [r1+r2*2]
-    mova %2, [filt_mul51 GLOBAL]
+    mova %2, [filt_mul51]
     mova m4, m1
     punpcklbw m1, m2
     punpckhbw m4, m2
@@ -452,8 +452,8 @@ HPEL_V ssse3
     pmaddubsw m4, %2
     pmaddubsw m0, %2
     pmaddubsw m2, %2
-    pmaddubsw m3, [filt_mul20 GLOBAL]
-    pmaddubsw %1, [filt_mul20 GLOBAL]
+    pmaddubsw m3, [filt_mul20]
+    pmaddubsw %1, [filt_mul20]
     psrlw     %3, 8
     psrlw     %4, 8
     paddw m1, m0
@@ -1096,7 +1096,7 @@ cglobal x264_mbtree_propagate_cost_sse2, 6,6
     add r4, r5
     neg r5
     pxor      xmm5, xmm5
-    movdqa    xmm4, [pd_128 GLOBAL]
+    movdqa    xmm4, [pd_128]
 .loop:
     movq      xmm2, [r2+r5] ; intra
     movq      xmm0, [r4+r5] ; invq
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index d94daaf..46b4557 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -59,7 +59,7 @@ SECTION .text
 %endmacro
 
 %macro HADDW 2
-    pmaddwd %1, [pw_1 GLOBAL]
+    pmaddwd %1, [pw_1]
     HADDD   %1, %2
 %endmacro
 
@@ -244,9 +244,9 @@ cglobal x264_pixel_ssd_%1x%2_%3, 0,0,0
 %endif
 
 %ifidn %3, ssse3
-    mova    m7, [hsub_mul GLOBAL]
+    mova    m7, [hsub_mul]
 %elifidn %3, sse2
-    mova    m7, [pw_00ff GLOBAL]
+    mova    m7, [pw_00ff]
 %elif %1 >= mmsize
     pxor    m7, m7
 %endif
@@ -310,7 +310,7 @@ SSD  4,  8, ssse3
     pxor  m5, m5    ; sum
     pxor  m6, m6    ; sum squared
 %if %1
-    mova  m7, [pw_00ff GLOBAL]
+    mova  m7, [pw_00ff]
 %else
     pxor  m7, m7    ; zero
 %endif
@@ -482,7 +482,7 @@ cglobal x264_pixel_var2_8x8_sse2, 5,6,8
 cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
     pxor      m5, m5    ; sum
     pxor      m6, m6    ; sum squared
-    mova      m7, [hsub_mul GLOBAL]
+    mova      m7, [hsub_mul]
     mov      r5d, 2
 .loop:
     movq      m0, [r0]
@@ -775,7 +775,7 @@ cglobal x264_pixel_satd_4x4_mmxext, 4,6
 
 %macro SATD_START_SSE2 3
 %ifnidn %1, sse2
-    mova    %3, [hmul_8p GLOBAL]
+    mova    %3, [hmul_8p]
 %endif
     lea     r4, [3*r1]
     lea     r5, [3*r3]
@@ -815,7 +815,7 @@ INIT_XMM
 %ifnidn %1, sse2
 cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
     SATD_START_MMX
-    mova m4, [hmul_4p GLOBAL]
+    mova m4, [hmul_4p]
     LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
     LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
     LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
@@ -832,7 +832,7 @@ cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
 cglobal x264_pixel_satd_4x8_%1, 4, 6, 8
     SATD_START_MMX
 %ifnidn %1, sse2
-    mova m7, [hmul_4p GLOBAL]
+    mova m7, [hmul_4p]
 %endif
     movd m4, [r2]
     movd m5, [r2+r3]
@@ -889,14 +889,14 @@ cglobal x264_pixel_satd_16x4_internal_%1
 cglobal x264_pixel_satd_16x8_%1, 4,6,12
     SATD_START_SSE2 %1, m10, m7
 %ifidn %1, sse2
-    mova m7, [pw_00ff GLOBAL]
+    mova m7, [pw_00ff]
 %endif
     jmp x264_pixel_satd_16x8_internal_%1
 
 cglobal x264_pixel_satd_16x16_%1, 4,6,12
     SATD_START_SSE2 %1, m10, m7
 %ifidn %1, sse2
-    mova m7, [pw_00ff GLOBAL]
+    mova m7, [pw_00ff]
 %endif
     call x264_pixel_satd_16x4_internal_%1
     call x264_pixel_satd_16x4_internal_%1
@@ -977,7 +977,7 @@ cglobal x264_pixel_sa8d_8x8_%1, 4,6,12
     lea  r4, [3*r1]
     lea  r5, [3*r3]
 %ifnidn %1, sse2
-    mova m7, [hmul_8p GLOBAL]
+    mova m7, [hmul_8p]
 %endif
     call x264_pixel_sa8d_8x8_internal_%1
     HADDW m0, m1
@@ -990,7 +990,7 @@ cglobal x264_pixel_sa8d_16x16_%1, 4,6,12
     lea  r4, [3*r1]
     lea  r5, [3*r3]
 %ifnidn %1, sse2
-    mova m7, [hmul_8p GLOBAL]
+    mova m7, [hmul_8p]
 %endif
     call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
     add  r2, 8
@@ -1029,7 +1029,7 @@ cglobal x264_pixel_sa8d_8x8_internal_%1
     paddw m0, m1
     HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
 %else ; non-sse2
-    mova m7, [hmul_8p GLOBAL]
+    mova m7, [hmul_8p]
     LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
     ; could do first HADAMARD4_V here to save spilling later
     ; surprisingly, not a win on conroe or even p4
@@ -1221,7 +1221,7 @@ cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16
     paddusw     m2,  m0
 
     ; 3x HADDW
-    movdqa      m7,  [pw_1 GLOBAL]
+    movdqa      m7,  [pw_1]
     pmaddwd     m2,  m7
     pmaddwd     m14, m7
     pmaddwd     m15, m7
@@ -1650,7 +1650,7 @@ cglobal x264_hadamard_ac_2x2max_mmxext
     ret
 
 cglobal x264_hadamard_ac_8x8_mmxext
-    mova      m6, [mask_ac4 GLOBAL]
+    mova      m6, [mask_ac4]
     pxor      m7, m7
     call x264_hadamard_ac_4x4_mmxext
     add       r0, 4
@@ -1727,7 +1727,7 @@ cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
     mova    m3, m0
     paddusw m1, [rsp+0x38]
     pxor    m3, m2
-    pand    m3, [pw_1 GLOBAL]
+    pand    m3, [pw_1]
     pavgw   m0, m2
     psubusw m0, m3
     HADDUW  m0, m2
@@ -1791,7 +1791,7 @@ cglobal x264_hadamard_ac_8x8_%1
 %endif
 %ifnidn %1, sse2
     ;LOAD_INC loads sumsubs
-    mova      m7, [hmul_8p GLOBAL]
+    mova      m7, [hmul_8p]
 %else
     ;LOAD_INC only unpacks to words
     pxor      m7, m7
@@ -1834,9 +1834,9 @@ cglobal x264_hadamard_ac_8x8_%1
     paddw     m1, m2
     SUMSUB_BA m0, m4; m2
 %ifnidn %1, sse2
-    pand      m1, [mask_ac4b GLOBAL]
+    pand      m1, [mask_ac4b]
 %else
-    pand      m1, [mask_ac4 GLOBAL]
+    pand      m1, [mask_ac4]
 %endif
     ABS_MOV   m2, spill0
     paddw     m1, m3
@@ -1878,7 +1878,7 @@ cglobal x264_hadamard_ac_8x8_%1
     paddw m2, m1
     paddw m2, m2
     ABS1      m4, m7
-    pand      m0, [mask_ac8 GLOBAL]
+    pand      m0, [mask_ac8]
     ABS1      m0, m7
     paddw m2, m4
     paddw m0, m2
@@ -2041,7 +2041,7 @@ cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
     SSIM_ITER 3
     ; PHADDW m1, m2
     ; PHADDD m3, m4
-    movdqa    m7, [pw_1 GLOBAL]
+    movdqa    m7, [pw_1]
     pshufd    m5, m3, 0xb1
     pmaddwd   m1, m7
     pmaddwd   m2, m7
@@ -2086,8 +2086,8 @@ cglobal x264_pixel_ssim_end4_sse2, 3,3,7
     paddd     m1, m2
     paddd     m2, m3
     paddd     m3, m4
-    movdqa    m5, [ssim_c1 GLOBAL]
-    movdqa    m6, [ssim_c2 GLOBAL]
+    movdqa    m5, [ssim_c1]
+    movdqa    m6, [ssim_c2]
     TRANSPOSE4x4D  0, 1, 2, 3, 4
 
 ;   s1=m0, s2=m1, ss=m2, s12=m3
@@ -2117,10 +2117,10 @@ cglobal x264_pixel_ssim_end4_sse2, 3,3,7
     je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
     neg       r2
 %ifdef PIC
-    lea       r3, [mask_ff + 16 GLOBAL]
+    lea       r3, [mask_ff + 16]
     movdqu    m1, [r3 + r2*4]
 %else
-    movdqu    m1, [mask_ff + r2*4 + 16 GLOBAL]
+    movdqu    m1, [mask_ff + r2*4 + 16]
 %endif
     pand      m4, m1
 .skip:
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 808aa31..4d03f8f 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -99,7 +99,7 @@ SECTION .text
     pavgb       %2, %3
     pxor        %3, %5
     mov%6       %1, %4
-    pand        %3, [pb_1 GLOBAL]
+    pand        %3, [pb_1]
     psubusb     %2, %3
     pavgb       %1, %2
 %endmacro
@@ -466,7 +466,7 @@ cglobal predict_8x8_dc_mmxext, 2,2
     pxor        mm1, mm1
     psadbw      mm0, [r1+7]
     psadbw      mm1, [r1+16]
-    paddw       mm0, [pw_8 GLOBAL]
+    paddw       mm0, [pw_8]
     paddw       mm0, mm1
     psrlw       mm0, 4
     pshufw      mm0, mm0, 0
@@ -481,7 +481,7 @@ cglobal predict_8x8_dc_mmxext, 2,2
 cglobal %1, 2,2
     pxor        mm0, mm0
     psadbw      mm0, [r1+%2]
-    paddw       mm0, [pw_4 GLOBAL]
+    paddw       mm0, [pw_4]
     psrlw       mm0, 3
     pshufw      mm0, mm0, 0
     packuswb    mm0, mm0
@@ -643,7 +643,7 @@ cglobal predict_8x8_vr_core_mmxext, 2,2
 cglobal predict_8x8c_p_core_mmxext, 1,2
     LOAD_PLANE_ARGS
     movq        mm1, mm2
-    pmullw      mm2, [pw_3210 GLOBAL]
+    pmullw      mm2, [pw_3210]
     psllw       mm1, 2
     paddsw      mm0, mm2        ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
     paddsw      mm1, mm0        ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
@@ -672,7 +672,7 @@ cglobal predict_16x16_p_core_mmxext, 1,2
     LOAD_PLANE_ARGS
     movq        mm5, mm2
     movq        mm1, mm2
-    pmullw      mm5, [pw_3210 GLOBAL]
+    pmullw      mm5, [pw_3210]
     psllw       mm2, 3
     psllw       mm1, 2
     movq        mm3, mm2
@@ -786,7 +786,7 @@ cglobal predict_8x8_vl_sse2, 2,2
 ;-----------------------------------------------------------------------------
 cglobal predict_8x8_vr_sse2, 2,2,7
     movdqu      xmm0, [r1+8]
-    movdqa      xmm6, [pw_ff00 GLOBAL]
+    movdqa      xmm6, [pw_ff00]
     add         r0, 4*FDEC_STRIDE
     movdqa      xmm1, xmm0
     movdqa      xmm2, xmm0
@@ -910,7 +910,7 @@ cglobal predict_8x8_hu_%1, 2,2
     add        r0, 4*FDEC_STRIDE
 %ifidn %1, ssse3
     movq      mm5, [r1+7]
-    movq      mm6, [pb_reverse GLOBAL]
+    movq      mm6, [pb_reverse]
     movq      mm1, mm5
     movq      mm2, mm5
     movq      mm3, mm5
@@ -979,7 +979,7 @@ cglobal predict_8x8c_v_mmx, 1,1
 %macro PRED_8x8C_H 1
 cglobal predict_8x8c_h_%1, 1,1
 %ifidn %1, ssse3
-    mova   m1, [pb_3 GLOBAL]
+    mova   m1, [pb_3]
 %endif
 %assign n 0
 %rep 8
@@ -1018,7 +1018,7 @@ cglobal predict_8x8c_dc_core_mmxext, 1,1
     pshufw      mm2, r2m, 0
 %endif
     psrlw       mm0, 3
-    paddw       mm1, [pw_2 GLOBAL]
+    paddw       mm1, [pw_2]
     movq        mm3, mm2
     pshufw      mm1, mm1, 0
     pshufw      mm0, mm0, 0     ; dc0 (w)
@@ -1065,7 +1065,7 @@ cglobal predict_8x8c_p_core_sse2, 1,1
     punpcklqdq  xmm0, xmm0
     punpcklqdq  xmm2, xmm2
     punpcklqdq  xmm4, xmm4
-    pmullw      xmm2, [pw_76543210 GLOBAL]
+    pmullw      xmm2, [pw_76543210]
     paddsw      xmm0, xmm2        ; xmm0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
     movdqa      xmm3, xmm0
     paddsw      xmm3, xmm4
@@ -1107,7 +1107,7 @@ cglobal predict_16x16_p_core_sse2, 1,2,8
     punpcklqdq  xmm1, xmm1
     punpcklqdq  xmm2, xmm2
     movdqa      xmm3, xmm1
-    pmullw      xmm3, [pw_76543210 GLOBAL]
+    pmullw      xmm3, [pw_76543210]
     psllw       xmm1, 3
     paddsw      xmm0, xmm3  ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
     paddsw      xmm1, xmm0  ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
@@ -1162,7 +1162,7 @@ cglobal predict_16x16_v_sse2, 1,1
 cglobal predict_16x16_h_%1, 1,2
     mov r1, FDEC_STRIDE*12
 %ifidn %1, ssse3
-    mova   m1, [pb_3 GLOBAL]
+    mova   m1, [pb_3]
 %endif
 .vloop:
 %assign n 0
@@ -1214,7 +1214,7 @@ cglobal predict_16x16_dc_core_mmxext, 1,2
     REP_RET
 
 cglobal predict_16x16_dc_top_mmxext, 1,2
-    PRED16x16_DC [pw_8 GLOBAL], 4
+    PRED16x16_DC [pw_8], 4
     REP_RET
 
 cglobal predict_16x16_dc_left_core_mmxext, 1,1
@@ -1247,7 +1247,7 @@ cglobal predict_16x16_dc_core_sse2, 1,1
     RET
 
 cglobal predict_16x16_dc_top_sse2, 1,1
-    PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
+    PRED16x16_DC_SSE2 [pw_8], 4
     RET
 
 cglobal predict_16x16_dc_left_core_sse2, 1,1
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 52e121a..3edd244 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -86,7 +86,7 @@ SECTION .text
 %endmacro
 
 %macro QUANT_DC_START_SSSE3 0
-    movdqa     m5, [pb_01 GLOBAL]
+    movdqa     m5, [pb_01]
     movd       m6, r1m     ; mf
     movd       m7, r2m     ; bias
     pshufb     m6, m5
@@ -361,7 +361,7 @@ cglobal x264_dequant_%2x%2_%1, 0,3
 .rshift32:
     neg   t0d
     movd  m2, t0d
-    mova  m3, [pd_1 GLOBAL]
+    mova  m3, [pd_1]
     pxor  m4, m4
     pslld m3, m2
     psrld m3, 1
@@ -381,10 +381,10 @@ cglobal x264_dequant_%2x%2_flat16_%1, 0,3
     sub  t2d, t1d   ; i_mf = i_qp % 6
     shl  t2d, %3
 %ifdef PIC
-    lea  r1, [dequant%2_scale GLOBAL]
+    lea  r1, [dequant%2_scale]
     add  r1, t2
 %else
-    lea  r1, [dequant%2_scale + t2 GLOBAL]
+    lea  r1, [dequant%2_scale + t2]
 %endif
     movifnidn r0, r0mp
     movd m4, t0d
@@ -446,7 +446,7 @@ cglobal x264_dequant_4x4dc_%1, 0,3
 .rshift32:
     neg   t0d
     movd  m3, t0d
-    mova  m4, [pw_1 GLOBAL]
+    mova  m4, [pw_1]
     mova  m5, m4
     pslld m4, m3
     psrld m4, 1
@@ -588,15 +588,15 @@ cextern x264_decimate_table8
 ;This is not true for score64.
 cglobal x264_decimate_score%1_%2, 1,3
 %ifdef PIC
-    lea r10, [x264_decimate_table4 GLOBAL]
-    lea r11, [decimate_mask_table4 GLOBAL]
+    lea r10, [x264_decimate_table4]
+    lea r11, [decimate_mask_table4]
     %define table r10
     %define mask_table r11
 %else
     %define table x264_decimate_table4
     %define mask_table decimate_mask_table4
 %endif
-    DECIMATE_MASK edx, eax, r0, [pb_1 GLOBAL], %2, ecx
+    DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx
     xor   edx, 0xffff
     je   .ret
     test  eax, eax
@@ -640,12 +640,12 @@ DECIMATE4x4 16, ssse3
 %ifdef ARCH_X86_64
 cglobal x264_decimate_score64_%1, 1,4
 %ifdef PIC
-    lea r10, [x264_decimate_table8 GLOBAL]
+    lea r10, [x264_decimate_table8]
     %define table r10
 %else
     %define table x264_decimate_table8
 %endif
-    mova  m5, [pb_1 GLOBAL]
+    mova  m5, [pb_1]
     DECIMATE_MASK r1d, eax, r0, m5, %1, null
     test  eax, eax
     jne  .ret9
@@ -681,7 +681,7 @@ cglobal x264_decimate_score64_%1, 1,6
 %else
 cglobal x264_decimate_score64_%1, 1,5
 %endif
-    mova  m7, [pb_1 GLOBAL]
+    mova  m7, [pb_1]
     DECIMATE_MASK r3, r2, r0, m7, %1, r5
     test  r2, r2
     jne  .ret9
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 342a984..6db8abf 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -351,7 +351,7 @@ cglobal x264_intra_sad_x3_8x8_mmxext, 3,3
     psadbw    m0, m7
     psadbw    m1, m6
     paddw     m0, m1
-    paddw     m0, [pw_8 GLOBAL]
+    paddw     m0, [pw_8]
     psrlw     m0, 4
     punpcklbw m0, m0
     pshufw    m0, m0, 0x0 ;DC prediction
@@ -411,7 +411,7 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
     movq        m6, [r1 - FDEC_STRIDE]
     add         r1, FDEC_STRIDE*4
 %ifidn %1,ssse3
-    movq        m7, [pb_3 GLOBAL]
+    movq        m7, [pb_3]
 %endif
     INTRA_SAD_HV_ITER 0, %1
     INTRA_SAD_HV_ITER 2, %1
@@ -450,7 +450,7 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
     pavgw       m0, m7 ; s0+s2, s1, s3, s1+s3
 %ifidn %1, ssse3
     movq2dq   xmm0, m0
-    pshufb    xmm0, [pb_shuf8x8c GLOBAL]
+    pshufb    xmm0, [pb_shuf8x8c]
     movq      xmm1, [r0+FENC_STRIDE*0]
     movq      xmm2, [r0+FENC_STRIDE*1]
     movq      xmm3, [r0+FENC_STRIDE*2]
@@ -522,7 +522,7 @@ cglobal x264_intra_sad_x3_16x16_%1,3,5,%2
     paddw   mm0, mm1
     movd    r3d, mm0
 %ifidn %1, ssse3
-    mova  m1, [pb_3 GLOBAL]
+    mova  m1, [pb_3]
 %endif
 %assign x 0
 %rep 16
@@ -1301,10 +1301,10 @@ cglobal x264_pixel_sad_16x%2_cache64_%1
 %endif
 %define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
 %ifdef PIC
-    lea     r5, [sad_w16_addr GLOBAL]
+    lea     r5, [sad_w16_addr]
     add     r5, r4
 %else
-    lea     r5, [sad_w16_addr + r4 GLOBAL]
+    lea     r5, [sad_w16_addr + r4]
 %endif
     and     r2, ~15
     mov     r4d, %2/2
@@ -1323,7 +1323,7 @@ cglobal x264_pixel_sad_16x%2_cache64_%1
     jle x264_pixel_sad_%1x%2_mmxext
     and    eax, 7
     shl    eax, 3
-    movd   mm6, [sw_64 GLOBAL]
+    movd   mm6, [sw_64]
     movd   mm7, eax
     psubw  mm6, mm7
     PROLOGUE 4,5
diff --git a/common/x86/util.h b/common/x86/util.h
index efc700a..c8bcf4b 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -45,8 +45,9 @@ static inline void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b,
 #define x264_predictor_difference x264_predictor_difference_mmxext
 static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t i_mvc )
 {
-    int sum = 0;
-    uint16_t output[4];
+    int sum;
+    static const uint64_t pw_1 = 0x0001000100010001ULL;
+
     asm(
         "pxor    %%mm4, %%mm4 \n"
         "test    $1, %1       \n"
@@ -56,7 +57,7 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
         "psubw   %%mm3, %%mm0 \n"
         "jmp 2f               \n"
         "3:                   \n"
-        "sub     $1,    %1    \n"
+        "dec     %1           \n"
         "1:                   \n"
         "movq    -8(%2,%1,4), %%mm0 \n"
         "psubw   -4(%2,%1,4), %%mm0 \n"
@@ -67,11 +68,13 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
         "pmaxsw  %%mm2, %%mm0 \n"
         "paddusw %%mm0, %%mm4 \n"
         "jg 1b                \n"
-        "movq    %%mm4, %0    \n"
-        :"=m"(output), "+r"(i_mvc)
-        :"r"(mvc), "m"(M64( mvc ))
+        "pmaddwd %4, %%mm4    \n"
+        "pshufw $14, %%mm4, %%mm0 \n"
+        "paddd   %%mm0, %%mm4 \n"
+        "movd    %%mm4, %0    \n"
+        :"=r"(sum), "+r"(i_mvc)
+        :"r"(mvc), "m"(M64( mvc )), "m"(pw_1)
     );
-    sum += output[0] + output[1] + output[2] + output[3];
     return sum;
 }
 #define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 2a91084..ee3eca9 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -65,28 +65,16 @@
     %endif
 %endmacro
 
-; PIC support macros.
-; x86_64 can't fit 64bit address literals in most instruction types,
-; so shared objects (under the assumption that they might be anywhere
-; in memory) must use an address mode that does fit.
-; So all accesses to global variables must use this macro, e.g.
-;     mov eax, [foo GLOBAL]
-; instead of
-;     mov eax, [foo]
-;
-; x86_32 doesn't require PIC.
-; Some distros prefer shared objects to be PIC, but nothing breaks if
-; the code contains a few textrels, so we'll skip that complexity.
-
 %ifdef WIN64
     %define PIC
 %elifndef ARCH_X86_64
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
     %undef PIC
 %endif
 %ifdef PIC
-    %define GLOBAL wrt rip
-%else
-    %define GLOBAL
+    default rel
 %endif
 
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index b822688..d70bb0e 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -239,10 +239,10 @@
 ; %3/%4: source regs
 ; %5/%6: tmp regs
 %ifidn %1, d
-%define mask [mask_10 GLOBAL]
+%define mask [mask_10]
 %define shift 16
 %elifidn %1, q
-%define mask [mask_1100 GLOBAL]
+%define mask [mask_1100]
 %define shift 32
 %endif
 %if %0==6 ; less dependency if we have two tmp
diff --git a/configure b/configure
index b254383..25f5458 100755
--- a/configure
+++ b/configure
@@ -23,6 +23,7 @@ echo "  --extra-cflags=ECFLAGS   add ECFLAGS to CFLAGS"
 echo "  --extra-ldflags=ELDFLAGS add ELDFLAGS to LDFLAGS"
 echo "  --host=HOST              build programs to run on HOST"
 echo "  --cross-prefix=PREFIX    use PREFIX for compilation tools"
+echo "  --sysroot=SYSROOT        root of cross-build tree"
 echo ""
 exit 1
 fi
@@ -223,6 +224,10 @@ for opt do
         --cross-prefix=*)
             cross_prefix="${opt#--cross-prefix=}"
             ;;
+        --sysroot=*)
+            CFLAGS="$CFLAGS --sysroot=${opt#--sysroot=}"
+            LDFLAGS="$LDFLAGS --sysroot=${opt#--sysroot=}"
+            ;;
         *)
             echo "Unknown option $opt, ignored"
             ;;
@@ -367,7 +372,17 @@ case $host_cpu in
     ;;
   arm*)
     ARCH="ARM"
-    AS="${AS-${cross_prefix}gcc}"
+    if [ "$SYS" = MACOSX ] ; then
+      AS="${AS-extras/gas-preprocessor.pl $CC}"
+      ASFLAGS="$ASFLAGS -DPREFIX -DPIC"  # apple's ld doesn't support movw/movt relocations at all
+      # build for armv7 by default
+      if ! echo $CFLAGS | grep -Eq '\-arch' ; then
+        CFLAGS="$CFLAGS -arch armv7"
+        LDFLAGS="$LDFLAGS -arch armv7"
+      fi
+    else
+      AS="${AS-${cross_prefix}gcc}"
+    fi
     ;;
   s390|s390x)
     ARCH="S390"
@@ -427,10 +442,10 @@ if [ $asm = yes -a $ARCH = ARM ] ; then
     # set flags so neon is built by default
     echo $CFLAGS | grep -Eq '(-mcpu|-march|-mfpu|-mfloat-abi)' || CFLAGS="$CFLAGS -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp"
 
-    if  cc_check '' '' '__asm__("rev ip, ip");' ; then      define HAVE_ARMV6   && ASFLAGS="$ASFLAGS -DHAVE_ARMV6"
-        cc_check '' '' '__asm__("movt r0, #0");'         && define HAVE_ARMV6T2 && ASFLAGS="$ASFLAGS -DHAVE_ARMV6T2"
-        cc_check '' '' '__asm__("vadd.i16 q0, q0, q0");' && define HAVE_NEON    && ASFLAGS="$ASFLAGS -DHAVE_NEON"
-        ASFLAGS="$ASFLAGS -c"
+    if  cc_check '' '' '__asm__("rev ip, ip");' ; then      define HAVE_ARMV6
+        cc_check '' '' '__asm__("movt r0, #0");'         && define HAVE_ARMV6T2
+        cc_check '' '' '__asm__("vadd.i16 q0, q0, q0");' && define HAVE_NEON
+        ASFLAGS="$ASFLAGS $CFLAGS -c"
     else
         echo "You specified a pre-ARMv6 or Thumb-1 CPU in your CFLAGS."
         echo "If you really want to run on such a CPU, configure with --disable-asm."
diff --git a/doc/standards.txt b/doc/standards.txt
index db9a691..7474d8f 100644
--- a/doc/standards.txt
+++ b/doc/standards.txt
@@ -4,6 +4,7 @@ checkasm is written in gcc, with no attempt at compatibility with anything else.
 We make the following additional assumptions which are true of real systems but not guaranteed by C99:
 * Two's complement.
 * Signed right-shifts are sign-extended.
+* int is 32-bit or larger.
 
 x86-specific assumptions:
 * The stack is 16-byte aligned. We align it on entry to libx264 and on entry to any thread, but the compiler must preserve alignment after that.
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 666596b..1d48b7d 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -40,6 +40,7 @@ typedef struct
     int i_ref;
     int       i_rd16x16;
     x264_me_t me16x16;
+    x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */
 
     /* 8x8 */
     int       i_cost8x8;
@@ -361,8 +362,12 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 
     h->mb.i_me_method = h->param.analyse.i_me_method;
     h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
+    if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
+        h->mb.i_subpel_refine--;
     h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
                         && h->mb.i_subpel_refine >= 5;
+    h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
+                          (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
 
     h->mb.b_transform_8x8 = 0;
     h->mb.b_noise_reduction = 0;
@@ -1722,20 +1727,45 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     a->l1.me16x16.i_ref = a->l1.i_ref;
 
     /* get cost of BI mode */
+    int ref_costs = REF_COST( 0, a->l0.i_ref ) + REF_COST( 1, a->l1.i_ref );
+    h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
+    h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
     src0 = h->mc.get_ref( pix0, &stride0,
                           h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
-                          a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16, weight_none );
+                          a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
     src1 = h->mc.get_ref( pix1, &stride1,
                           h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
-                          a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16, weight_none );
+                          a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
 
     h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
 
     a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
-                     + REF_COST( 0, a->l0.i_ref )
-                     + REF_COST( 1, a->l1.i_ref )
-                     + a->l0.me16x16.cost_mv
-                     + a->l1.me16x16.cost_mv;
+                     + ref_costs
+                     + a->l0.bi16x16.cost_mv
+                     + a->l1.bi16x16.cost_mv;
+
+
+    /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
+    if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
+    {
+        int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
+                       + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
+        int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
+                       + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
+        h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.i_ref][0], h->mb.pic.i_stride[0],
+                                h->mb.pic.p_fref[1][a->l1.i_ref][0], h->mb.pic.i_stride[0],
+                                h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
+        int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
+                   + ref_costs + l0_mv_cost + l1_mv_cost;
+        if( cost00 < a->i_cost16x16bi )
+        {
+            M32( a->l0.bi16x16.mv ) = 0;
+            M32( a->l1.bi16x16.mv ) = 0;
+            a->l0.bi16x16.cost_mv = l0_mv_cost;
+            a->l1.bi16x16.cost_mv = l1_mv_cost;
+            a->i_cost16x16bi = cost00;
+        }
+    }
 
     /* mb type cost */
     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
@@ -2205,7 +2235,7 @@ static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
     {
         case D_16x16:
             if( h->mb.i_type == B_BI_BI )
-                x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
+                x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
             break;
         case D_16x8:
             for( i=0; i<2; i++ )
@@ -2277,9 +2307,10 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
     int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
     int last_qp_tried = 0;
     origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
+    int origcbp = h->mb.cbp[h->mb.i_mb_xy];
 
     /* If CBP is already zero, don't raise the quantizer any higher. */
-    for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
+    for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
     {
         /* Without psy-RD, require monotonicity when moving quant away from previous
          * macroblock's quant; allow 1 failure when moving quant towards previous quant.
@@ -2294,14 +2325,47 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
         h->mb.i_qp = orig_qp;
         failures = 0;
         prevcost = origcost;
+
+        /* If the current QP results in an empty CBP, it's highly likely that lower QPs
+         * (up to a point) will too.  So, jump down to where the threshold will kick in
+         * and check the QP there.  If the CBP is still empty, skip the main loop.
+         * If it isn't empty, we would have ended up having to check this QP anyways,
+         * so as long as we store it for later lookup, we lose nothing. */
+        int already_checked_qp = -1;
+        int already_checked_cost = COST_MAX;
+        if( direction == -1 )
+        {
+            if( !origcbp )
+            {
+                h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
+                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
+                already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
+                if( !h->mb.cbp[h->mb.i_mb_xy] )
+                {
+                    /* If our empty-CBP block is lower QP than the last QP,
+                     * the last QP almost surely doesn't have a CBP either. */
+                    if( h->mb.i_last_qp > h->mb.i_qp )
+                        last_qp_tried = 1;
+                    break;
+                }
+                already_checked_qp = h->mb.i_qp;
+                h->mb.i_qp = orig_qp;
+            }
+        }
+
         h->mb.i_qp += direction;
         while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
         {
             if( h->mb.i_last_qp == h->mb.i_qp )
                 last_qp_tried = 1;
-            h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
-            cost = x264_rd_cost_mb( h, a->i_lambda2 );
-            COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
+            if( h->mb.i_qp == already_checked_qp )
+                cost = already_checked_cost;
+            else
+            {
+                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
+                cost = x264_rd_cost_mb( h, a->i_lambda2 );
+                COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
+            }
 
             /* We can't assume that the costs are monotonic over QPs.
              * Tie case-as-failure seems to give better results. */
@@ -2819,8 +2883,8 @@ intra_analysis:
                 }
                 else if( i_type == B_BI_BI )
                 {
-                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
-                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
+                    x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
+                    x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
                 }
             }
             else if( i_partition == D_16x8 )
@@ -2938,7 +3002,7 @@ intra_analysis:
                         x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
                     }
                     else if( i_type == B_BI_BI )
-                        x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
+                        x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
                 }
                 else if( i_partition == D_16x8 )
                 {
@@ -3121,10 +3185,10 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
                     break;
                 case B_BI_BI:
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
-                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
 
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
-                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
                     break;
                 }
                 break;
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index c65c9bd..85d2dde 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -147,10 +147,9 @@ static int block_residual_write_cavlc( x264_t *h, int i_ctxBlockCat, int16_t *l,
 
     if( i_trailing < i_total )
     {
-        int16_t val = runlevel.level[i_trailing];
-        int16_t val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
-        if( i_trailing < 3 )
-            val -= (val>>15)|1; /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
+        int val = runlevel.level[i_trailing];
+        int val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
+        val -= ((val>>31)|1) & -(i_trailing < 3); /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
         val += LEVEL_TABLE_SIZE/2;
 
         if( (unsigned)val_original < LEVEL_TABLE_SIZE )
diff --git a/encoder/encoder.c b/encoder/encoder.c
index d873cd0..df62389 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -84,7 +84,7 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
     x264_param_t *param = &h->param;
     int i;
 
-    /* First we fill all field */
+    /* First we fill all fields */
     sh->sps = sps;
     sh->pps = pps;
 
@@ -108,12 +108,24 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
 
     sh->i_redundant_pic_cnt = 0;
 
-    if( !h->mb.b_direct_auto_read )
+    h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
+                                && h->param.i_bframe
+                                && ( h->param.rc.b_stat_write || !h->param.rc.b_stat_read );
+
+    if( !h->mb.b_direct_auto_read && sh->i_type == SLICE_TYPE_B )
     {
-        if( h->mb.b_direct_auto_write )
-            sh->b_direct_spatial_mv_pred = ( h->stat.i_direct_score[1] > h->stat.i_direct_score[0] );
+        if( h->fref1[0]->i_poc_l0ref0 == h->fref0[0]->i_poc )
+        {
+            if( h->mb.b_direct_auto_write )
+                sh->b_direct_spatial_mv_pred = ( h->stat.i_direct_score[1] > h->stat.i_direct_score[0] );
+            else
+                sh->b_direct_spatial_mv_pred = ( param->analyse.i_direct_mv_pred == X264_DIRECT_PRED_SPATIAL );
+        }
         else
-            sh->b_direct_spatial_mv_pred = ( param->analyse.i_direct_mv_pred == X264_DIRECT_PRED_SPATIAL );
+        {
+            h->mb.b_direct_auto_write = 0;
+            sh->b_direct_spatial_mv_pred = 1;
+        }
     }
     /* else b_direct_spatial_mv_pred was read from the 2pass statsfile */
 
@@ -430,11 +442,6 @@ static int x264_validate_parameters( x264_t *h )
             x264_log( h, X264_LOG_WARNING, "interlace + me=esa is not implemented\n" );
             h->param.analyse.i_me_method = X264_ME_UMH;
         }
-        if( h->param.analyse.i_direct_mv_pred > X264_DIRECT_PRED_SPATIAL )
-        {
-            x264_log( h, X264_LOG_WARNING, "interlace + direct=temporal is not implemented\n" );
-            h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
-        }
         if( h->param.analyse.i_weighted_pred > 0 )
         {
             x264_log( h, X264_LOG_WARNING, "interlace + weightp is not implemented\n" );
@@ -507,6 +514,39 @@ static int x264_validate_parameters( x264_t *h )
     }
     h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, 51 );
     h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
+    if( h->param.rc.i_vbv_buffer_size )
+    {
+        if( h->param.rc.i_rc_method == X264_RC_CQP )
+        {
+            x264_log( h, X264_LOG_WARNING, "VBV is incompatible with constant QP, ignored.\n" );
+            h->param.rc.i_vbv_max_bitrate = 0;
+            h->param.rc.i_vbv_buffer_size = 0;
+        }
+        else if( h->param.rc.i_vbv_max_bitrate == 0 )
+        {
+            if( h->param.rc.i_rc_method == X264_RC_ABR )
+            {
+                x264_log( h, X264_LOG_WARNING, "VBV maxrate unspecified, assuming CBR\n" );
+                h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
+            }
+            else
+            {
+                x264_log( h, X264_LOG_WARNING, "VBV bufsize set but maxrate unspecified, ignored\n" );
+                h->param.rc.i_vbv_buffer_size = 0;
+            }
+        }
+        else if( h->param.rc.i_vbv_max_bitrate < h->param.rc.i_bitrate &&
+                 h->param.rc.i_rc_method == X264_RC_ABR )
+        {
+            x264_log( h, X264_LOG_WARNING, "max bitrate less than average bitrate, assuming CBR\n" );
+            h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
+        }
+    }
+    else if( h->param.rc.i_vbv_max_bitrate )
+    {
+        x264_log( h, X264_LOG_WARNING, "VBV maxrate specified, but no bufsize, ignored\n" );
+        h->param.rc.i_vbv_max_bitrate = 0;
+    }
 
     int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
     if( h->param.b_sliced_threads )
@@ -566,8 +606,6 @@ static int x264_validate_parameters( x264_t *h )
         x264_log( h, X264_LOG_WARNING, "ref > 1 + intra-refresh is not supported\n" );
         h->param.i_frame_reference = 1;
     }
-    if( h->param.b_intra_refresh )
-        h->param.i_keyint_max = X264_MIN( h->param.i_keyint_max, (h->param.i_width+15)/16 - 1 );
     h->param.i_keyint_min = x264_clip3( h->param.i_keyint_min, 1, h->param.i_keyint_max/2+1 );
     h->param.rc.i_lookahead = x264_clip3( h->param.rc.i_lookahead, 0, X264_LOOKAHEAD_MAX );
     {
@@ -597,10 +635,6 @@ static int x264_validate_parameters( x264_t *h )
     h->param.i_sync_lookahead = 0;
 #endif
 
-    h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
-                                && h->param.i_bframe
-                                && ( h->param.rc.b_stat_write || !h->param.rc.b_stat_read );
-
     h->param.i_deblocking_filter_alphac0 = x264_clip3( h->param.i_deblocking_filter_alphac0, -6, 6 );
     h->param.i_deblocking_filter_beta    = x264_clip3( h->param.i_deblocking_filter_beta, -6, 6 );
     h->param.analyse.i_luma_deadzone[0] = x264_clip3( h->param.analyse.i_luma_deadzone[0], 0, 32 );
@@ -659,8 +693,6 @@ static int x264_validate_parameters( x264_t *h )
     /* Psy trellis has a similar effect. */
     if( h->mb.i_psy_trellis )
         h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_trellis < 0.25 ? 1 : 2;
-    else
-        h->mb.i_psy_trellis = 0;
     h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
     h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );
     h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 );
@@ -1071,7 +1103,7 @@ fail:
  ****************************************************************************/
 int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
 {
-    h = h->thread[h->i_thread_phase];
+    h = h->thread[h->thread[0]->i_thread_phase];
     x264_set_aspect_ratio( h, param, 0 );
 #define COPY(var) h->param.var = param->var
     COPY( i_frame_reference ); // but never uses more refs than initially specified
@@ -1110,11 +1142,30 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
     COPY( i_slice_max_size );
     COPY( i_slice_max_mbs );
     COPY( i_slice_count );
+    /* VBV can't be turned on if it wasn't on to begin with */
+    if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 &&
+          param->rc.i_vbv_max_bitrate > 0 &&   param->rc.i_vbv_buffer_size > 0 )
+    {
+        COPY( rc.i_vbv_max_bitrate );
+        COPY( rc.i_vbv_buffer_size );
+        COPY( rc.i_bitrate );
+    }
+    COPY( rc.f_rf_constant );
 #undef COPY
 
     mbcmp_init( h );
 
-    return x264_validate_parameters( h );
+    int ret = x264_validate_parameters( h );
+
+    /* Supported reconfiguration options (1-pass only):
+     * vbv-maxrate
+     * vbv-bufsize
+     * crf
+     * bitrate (CBR only) */
+    if( !ret )
+        x264_ratecontrol_init_reconfigurable( h, 0 );
+
+    return ret;
 }
 
 /****************************************************************************
@@ -2010,6 +2061,8 @@ static int x264_threaded_slices_write( x264_t *h )
     for( i = 0; i <= h->sps->i_mb_height; i++ )
         x264_fdec_filter_row( h, i );
 
+    x264_threads_merge_ratecontrol( h );
+
     for( i = 1; i < h->param.i_threads; i++ )
     {
         x264_t *t = h->thread[i];
@@ -2025,8 +2078,6 @@ static int x264_threaded_slices_write( x264_t *h )
             ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
     }
 
-    x264_threads_merge_ratecontrol( h );
-
     return 0;
 }
 
@@ -2255,22 +2306,22 @@ int     x264_encoder_encode( x264_t *h,
     if( h->param.b_intra_refresh && h->fenc->i_type == X264_TYPE_P )
     {
         int pocdiff = (h->fdec->i_poc - h->fref0[0]->i_poc)/2;
-        float increment = ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max;
+        float increment = X264_MAX( ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max, 1 );
+        int max_position = (int)(increment * h->param.i_keyint_max);
         if( IS_X264_TYPE_I( h->fref0[0]->i_type ) )
             h->fdec->f_pir_position = 0;
         else
         {
-            if( h->fref0[0]->i_pir_end_col == h->sps->i_mb_width - 1 )
+            h->fdec->f_pir_position = h->fref0[0]->f_pir_position;
+            if( h->fdec->f_pir_position+0.5 >= max_position )
             {
                 h->fdec->f_pir_position = 0;
                 h->fenc->b_keyframe = 1;
             }
-            else
-                h->fdec->f_pir_position = h->fref0[0]->f_pir_position;
         }
         h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
         h->fdec->f_pir_position += increment * pocdiff;
-        h->fdec->i_pir_end_col = X264_MIN( h->fdec->f_pir_position+0.5, h->sps->i_mb_width-1 );
+        h->fdec->i_pir_end_col = h->fdec->f_pir_position+0.5;
     }
 
     /* Write SPS and PPS */
@@ -2306,8 +2357,9 @@ int     x264_encoder_encode( x264_t *h,
 
         if( h->fenc->i_type != X264_TYPE_IDR )
         {
+            int time_to_recovery = X264_MIN( h->sps->i_mb_width - 1, h->param.i_keyint_max ) + h->param.i_bframe;
             x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-            x264_sei_recovery_point_write( h, &h->out.bs, h->param.i_keyint_max );
+            x264_sei_recovery_point_write( h, &h->out.bs, time_to_recovery );
             x264_nal_end( h );
             overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
         }
@@ -2327,6 +2379,9 @@ int     x264_encoder_encode( x264_t *h,
         x264_reference_check_reorder( h );
     }
 
+    if( h->i_ref0 )
+        h->fdec->i_poc_l0ref0 = h->fref0[0]->i_poc;
+
     if( h->sh.i_type == SLICE_TYPE_B )
         x264_macroblock_bipred_init( h );
 
@@ -2762,7 +2817,8 @@ void    x264_encoder_close  ( x264_t *h )
             x264_log( h, X264_LOG_INFO, "8x8 transform intra:%.1f%%%s\n", 100. * i_i8x8 / i_intra, buf );
         }
 
-        if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
+        if( (h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO ||
+            (h->stat.i_direct_frames[0] && h->stat.i_direct_frames[1]))
             && h->stat.i_frame_count[SLICE_TYPE_B] )
         {
             x264_log( h, X264_LOG_INFO, "direct mvs  spatial:%.1f%% temporal:%.1f%%\n",
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index e4edb8a..f67a898 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -42,30 +42,24 @@ static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[4] )
     int d1 = dct[2] + dct[3]; \
     int d2 = dct[0] - dct[1]; \
     int d3 = dct[2] - dct[3]; \
-    int dmf = dequant_mf[i_qp%6][0]; \
-    int qbits = i_qp/6 - 5; \
-    if( qbits > 0 ) \
-    { \
-        dmf <<= qbits; \
-        qbits = 0; \
-    }
+    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
 
 static inline void idct_dequant_2x2_dc( int16_t dct[4], int16_t dct4x4[4][16], int dequant_mf[6][16], int i_qp )
 {
     IDCT_DEQUANT_START
-    dct4x4[0][0] = (d0 + d1) * dmf >> -qbits;
-    dct4x4[1][0] = (d0 - d1) * dmf >> -qbits;
-    dct4x4[2][0] = (d2 + d3) * dmf >> -qbits;
-    dct4x4[3][0] = (d2 - d3) * dmf >> -qbits;
+    dct4x4[0][0] = (d0 + d1) * dmf >> 5; /* parses as ((d0 + d1) * dmf) >> 5; dmf already carries the << i_qp/6 applied in IDCT_DEQUANT_START */
+    dct4x4[1][0] = (d0 - d1) * dmf >> 5; /* each result is stored in coefficient 0 of one of the four dct4x4 blocks */
+    dct4x4[2][0] = (d2 + d3) * dmf >> 5;
+    dct4x4[3][0] = (d2 - d3) * dmf >> 5;
 }
 
 static inline void idct_dequant_2x2_dconly( int16_t out[4], int16_t dct[4], int dequant_mf[6][16], int i_qp )
 {
     IDCT_DEQUANT_START
-    out[0] = (d0 + d1) * dmf >> -qbits;
-    out[1] = (d0 - d1) * dmf >> -qbits;
-    out[2] = (d2 + d3) * dmf >> -qbits;
-    out[3] = (d2 - d3) * dmf >> -qbits;
+    out[0] = (d0 + d1) * dmf >> 5; /* parses as ((d0 + d1) * dmf) >> 5; dmf already carries the << i_qp/6 applied in IDCT_DEQUANT_START */
+    out[1] = (d0 - d1) * dmf >> 5; /* the four results go to out[0..3]; the dct input array is only read */
+    out[2] = (d2 + d3) * dmf >> 5;
+    out[3] = (d2 - d3) * dmf >> 5;
 }
 
 static inline void dct2x2dc( int16_t d[4], int16_t dct4x4[4][16] )
@@ -208,8 +202,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
     ALIGNED_ARRAY_16( int16_t, dct_dc4x4,[16] );
 
     int i, nz;
-    int b_decimate = h->sh.i_type == SLICE_TYPE_B || (h->param.analyse.b_dct_decimate && h->sh.i_type == SLICE_TYPE_P);
-    int decimate_score = b_decimate ? 0 : 9;
+    int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
 
     if( h->mb.b_lossless )
     {
@@ -342,7 +335,7 @@ static inline int x264_mb_optimize_chroma_dc( x264_t *h, int b_inter, int i_qp,
 void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
 {
     int i, ch, nz, nz_dc;
-    int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
+    int b_decimate = b_inter && h->mb.b_dct_decimate;
     ALIGNED_ARRAY_16( int16_t, dct2x2,[4] );
     h->mb.i_cbp_chroma = 0;
 
@@ -607,7 +600,7 @@ void x264_macroblock_encode( x264_t *h )
 {
     int i_cbp_dc = 0;
     int i_qp = h->mb.i_qp;
-    int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
+    int b_decimate = h->mb.b_dct_decimate;
     int b_force_no_skip = 0;
     int i,idx,nz;
     h->mb.i_cbp_luma = 0;
@@ -914,8 +907,7 @@ void x264_macroblock_encode( x264_t *h )
 
 /*****************************************************************************
  * x264_macroblock_probe_skip:
- *  Check if the current MB could be encoded as a [PB]_SKIP (it supposes you use
- *  the previous QP
+ *  Check if the current MB could be encoded as a [PB]_SKIP
  *****************************************************************************/
 int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
 {
@@ -1052,7 +1044,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
     int i_qp = h->mb.i_qp;
     uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
     uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
-    int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
+    int b_decimate = h->mb.b_dct_decimate;
     int nnz8x8 = 0;
     int ch, nz;
 
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 63b3be6..8c61582 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -134,9 +134,11 @@ struct x264_ratecontrol_t
                                  * This value is the current position (0 or 1). */
 
     /* MBRC stuff */
-    double frame_size_estimated;
+    float frame_size_estimated; /* Access to this variable must be atomic: double is
+                                 * not atomic on all arches we care about */
     double frame_size_planned;
     double slice_size_planned;
+    double max_frame_error;
     predictor_t (*row_pred)[2];
     predictor_t row_preds[5][2];
     predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */
@@ -388,6 +390,53 @@ static char *x264_strcat_filename( char *input, char *suffix )
     return output;
 }
 
+void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init ) /* (re)derive VBV and CRF rate-control state from h->param; b_init is 1 when called from x264_ratecontrol_new, 0 from x264_encoder_reconfig */
+{
+    x264_ratecontrol_t *rc = h->rc;
+    if( !b_init && rc->b_2pass ) /* a reconfig call changes nothing when 2-pass is active */
+        return;
+
+    if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 ) /* VBV state is only touched when both maxrate and bufsize are set */
+    {
+        if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) ) /* buffer smaller than maxrate/fps, i.e. one frame at maxrate */
+        {
+            h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps; /* raise the buffer to that one-frame minimum */
+            x264_log( h, X264_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
+                      h->param.rc.i_vbv_buffer_size );
+        }
+
+        /* We don't support changing the ABR bitrate right now,
+           so if the stream starts as CBR, keep it CBR. */
+        if( rc->b_vbv_min_rate ) /* NOTE(review): on the b_init call this flag is read before it is assigned below; presumably rc is zero-initialised -- confirm */
+            h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
+        rc->buffer_rate = h->param.rc.i_vbv_max_bitrate * 1000. / rc->fps; /* per-frame refill amount; the *1000 presumably converts kbit to bits (the log above reports kbit) */
+        rc->buffer_size = h->param.rc.i_vbv_buffer_size * 1000.;
+        rc->single_frame_vbv = rc->buffer_rate * 1.1 > rc->buffer_size; /* set when the buffer holds less than 1.1 frames' worth of refill */
+        rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
+                      * 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate);
+        if( b_init ) /* everything in this branch is set once at startup and left alone on reconfig */
+        {
+            if( h->param.rc.f_vbv_buffer_init > 1. ) /* a value above 1 is divided by the buffer size and clipped to a 0..1 fraction */
+                h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 );
+            h->param.rc.f_vbv_buffer_init = X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size ); /* initial fill is at least one frame's refill */
+            rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
+            rc->b_vbv = 1;
+            rc->b_vbv_min_rate = !rc->b_2pass
+                          && h->param.rc.i_rc_method == X264_RC_ABR
+                          && h->param.rc.i_vbv_max_bitrate <= h->param.rc.i_bitrate; /* 1-pass ABR with maxrate <= bitrate: the CBR case the comment above refers to */
+        }
+    }
+    if( h->param.rc.i_rc_method == X264_RC_CRF ) /* independent of the VBV branch; recomputed on every call that reaches here */
+    {
+        /* Arbitrary rescaling to make CRF somewhat similar to QP.
+         * Try to compensate for MB-tree's effects as well. */
+        double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
+        double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
+        rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress )
+                                 / qp2qscale( h->param.rc.f_rf_constant + mbtree_offset );
+    }
+}
+
 int x264_ratecontrol_new( x264_t *h )
 {
     x264_ratecontrol_t *rc;
@@ -426,60 +475,10 @@ int x264_ratecontrol_new( x264_t *h )
         x264_log(h, X264_LOG_ERROR, "constant rate-factor is incompatible with 2pass.\n");
         return -1;
     }
-    if( h->param.rc.i_vbv_buffer_size )
-    {
-        if( h->param.rc.i_rc_method == X264_RC_CQP )
-        {
-            x264_log(h, X264_LOG_WARNING, "VBV is incompatible with constant QP, ignored.\n");
-            h->param.rc.i_vbv_max_bitrate = 0;
-            h->param.rc.i_vbv_buffer_size = 0;
-        }
-        else if( h->param.rc.i_vbv_max_bitrate == 0 )
-        {
-            if( h->param.rc.i_rc_method == X264_RC_ABR )
-            {
-                x264_log( h, X264_LOG_INFO, "VBV maxrate unspecified, assuming CBR\n" );
-                h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
-            }
-            else
-            {
-                x264_log( h, X264_LOG_INFO, "VBV bufsize set but maxrate unspecified, ignored\n" );
-                h->param.rc.i_vbv_buffer_size = 0;
-            }
-        }
-    }
-    if( h->param.rc.i_vbv_max_bitrate < h->param.rc.i_bitrate &&
-        h->param.rc.i_vbv_max_bitrate > 0)
-        x264_log(h, X264_LOG_WARNING, "max bitrate less than average bitrate, ignored.\n");
-    else if( h->param.rc.i_vbv_max_bitrate > 0 &&
-             h->param.rc.i_vbv_buffer_size > 0 )
-    {
-        if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) )
-        {
-            h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps;
-            x264_log( h, X264_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
-                      h->param.rc.i_vbv_buffer_size );
-        }
-        if( h->param.rc.f_vbv_buffer_init > 1. )
-            h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 );
-        rc->buffer_rate = h->param.rc.i_vbv_max_bitrate * 1000. / rc->fps;
-        rc->buffer_size = h->param.rc.i_vbv_buffer_size * 1000.;
-        rc->single_frame_vbv = rc->buffer_rate * 1.1 > rc->buffer_size;
-        h->param.rc.f_vbv_buffer_init = X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size );
-        rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
-        rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
-                      * 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate);
-        rc->b_vbv = 1;
-        rc->b_vbv_min_rate = !rc->b_2pass
-                          && h->param.rc.i_rc_method == X264_RC_ABR
-                          && h->param.rc.i_vbv_max_bitrate <= h->param.rc.i_bitrate;
-    }
-    else if( h->param.rc.i_vbv_max_bitrate )
-    {
-        x264_log(h, X264_LOG_WARNING, "VBV maxrate specified, but no bufsize.\n");
-        h->param.rc.i_vbv_max_bitrate = 0;
-    }
-    if(rc->rate_tolerance < 0.01)
+
+    x264_ratecontrol_init_reconfigurable( h, 1 );
+
+    if( rc->rate_tolerance < 0.01 )
     {
         x264_log(h, X264_LOG_WARNING, "bitrate tolerance too small, using .01\n");
         rc->rate_tolerance = 0.01;
@@ -499,16 +498,6 @@ int x264_ratecontrol_new( x264_t *h )
         rc->last_non_b_pict_type = SLICE_TYPE_I;
     }
 
-    if( h->param.rc.i_rc_method == X264_RC_CRF )
-    {
-        /* Arbitrary rescaling to make CRF somewhat similar to QP.
-         * Try to compensate for MB-tree's effects as well. */
-        double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
-        double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
-        rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress )
-                                 / qp2qscale( h->param.rc.f_rf_constant + mbtree_offset );
-    }
-
     rc->ip_offset = 6.0 * log(h->param.rc.f_ip_factor) / log(2.0);
     rc->pb_offset = 6.0 * log(h->param.rc.f_pb_factor) / log(2.0);
     rc->qp_constant[SLICE_TYPE_P] = h->param.rc.i_qp_constant;
@@ -518,17 +507,21 @@ int x264_ratecontrol_new( x264_t *h )
 
     rc->lstep = pow( 2, h->param.rc.i_qp_step / 6.0 );
     rc->last_qscale = qp2qscale(26);
-    CHECKED_MALLOC( rc->pred, 5*sizeof(predictor_t) );
+    int num_preds = h->param.b_sliced_threads * h->param.i_threads + 1;
+    CHECKED_MALLOC( rc->pred, 5 * sizeof(predictor_t) * num_preds );
     CHECKED_MALLOC( rc->pred_b_from_p, sizeof(predictor_t) );
     for( i = 0; i < 5; i++ )
     {
         rc->last_qscale_for[i] = qp2qscale( ABR_INIT_QP );
         rc->lmin[i] = qp2qscale( h->param.rc.i_qp_min );
         rc->lmax[i] = qp2qscale( h->param.rc.i_qp_max );
-        rc->pred[i].coeff= 2.0;
-        rc->pred[i].count= 1.0;
-        rc->pred[i].decay= 0.5;
-        rc->pred[i].offset= 0.0;
+        for( j = 0; j < num_preds; j++ )
+        {
+            rc->pred[i+j*5].coeff= 2.0;
+            rc->pred[i+j*5].count= 1.0;
+            rc->pred[i+j*5].decay= 0.5;
+            rc->pred[i+j*5].offset= 0.0;
+        }
         for( j = 0; j < 2; j++ )
         {
             rc->row_preds[i][j].coeff= .25;
@@ -999,22 +992,6 @@ void x264_ratecontrol_delete( x264_t *h )
     x264_free( rc );
 }
 
-void x264_ratecontrol_set_estimated_size( x264_t *h, int bits )
-{
-    x264_pthread_mutex_lock( &h->fenc->mutex );
-    h->rc->frame_size_estimated = bits;
-    x264_pthread_mutex_unlock( &h->fenc->mutex );
-}
-
-int x264_ratecontrol_get_estimated_size( x264_t const *h)
-{
-    int size;
-    x264_pthread_mutex_lock( &h->fenc->mutex );
-    size = h->rc->frame_size_estimated;
-    x264_pthread_mutex_unlock( &h->fenc->mutex );
-    return size;
-}
-
 static void accum_p_qp_update( x264_t *h, float qp )
 {
     x264_ratecontrol_t *rc = h->rc;
@@ -1186,6 +1163,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
     /* tweak quality based on difference from predicted size */
     if( y < h->i_threadslice_end-1 )
     {
+        int i;
         int prev_row_qp = h->fdec->i_row_qp[y];
         int i_qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, h->param.rc.i_qp_max );
         int i_qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min );
@@ -1199,19 +1177,23 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
 
         float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned;
         float slice_size_planned = h->param.b_sliced_threads ? rc->slice_size_planned : rc->frame_size_planned;
-        float size_of_other_slices = rc->frame_size_planned - slice_size_planned;
+        float size_of_other_slices = 0;
+        if( h->param.b_sliced_threads )
+        {
+            for( i = 0; i < h->param.i_threads; i++ )
+                if( h != h->thread[i] )
+                    size_of_other_slices += h->thread[i]->rc->frame_size_estimated;
+        }
+        else
+            rc->max_frame_error = X264_MAX( 0.05, 1.0 / (h->sps->i_mb_width) );
+
         /* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
         float rc_tol = buffer_left_planned / h->param.i_threads * rc->rate_tolerance;
-        float max_frame_error = X264_MAX( 0.05, 1.0 / h->sps->i_mb_height );
-        int b1 = predict_row_size_sum( h, y, rc->qpm );
-
-        /* Assume that if this slice has become larger than expected,
-         * the other slices will have gotten equally larger. */
-        b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
+        int b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
 
         /* Don't modify the row QPs until a sufficent amount of the bits of the frame have been processed, in case a flat */
         /* area at the top of the frame was measured inaccurately. */
-        if( row_bits_so_far(h,y) < 0.05 * (rc->frame_size_planned-size_of_other_slices) )
+        if( row_bits_so_far( h, y ) < 0.05 * slice_size_planned )
             return;
 
         if( h->sh.i_type != SLICE_TYPE_I )
@@ -1226,8 +1208,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
                    (b1 > rc->frame_size_planned && rc->qpm < rc->qp_novbv)) )
         {
             rc->qpm ++;
-            b1 = predict_row_size_sum( h, y, rc->qpm );
-            b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
+            b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
         }
 
         while( rc->qpm > i_qp_min
@@ -1236,20 +1217,18 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
                || b1 < (rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 1.1) )
         {
             rc->qpm --;
-            b1 = predict_row_size_sum( h, y, rc->qpm );
-            b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
+            b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
         }
 
         /* avoid VBV underflow */
         while( (rc->qpm < h->param.rc.i_qp_max)
-               && (rc->buffer_fill - b1 < rc->buffer_rate * max_frame_error) )
+               && (rc->buffer_fill - b1 < rc->buffer_rate * rc->max_frame_error) )
         {
             rc->qpm ++;
-            b1 = predict_row_size_sum( h, y, rc->qpm );
-            b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
+            b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
         }
 
-        x264_ratecontrol_set_estimated_size(h, b1);
+        h->rc->frame_size_estimated = predict_row_size_sum( h, y, rc->qpm );
     }
 
     /* loses the fractional part of the frame-wise qp */
@@ -1293,6 +1272,7 @@ int x264_ratecontrol_slice_type( x264_t *h, int frame_num )
                 h->thread[i]->param.rc.b_stat_read = 0;
                 h->thread[i]->param.i_bframe_adaptive = 0;
                 h->thread[i]->param.i_scenecut_threshold = 0;
+                h->thread[i]->param.rc.b_mb_tree = 0;
                 if( h->thread[i]->param.i_bframe > 1 )
                     h->thread[i]->param.i_bframe = 1;
             }
@@ -1577,15 +1557,15 @@ static void update_vbv( x264_t *h, int bits )
     if( rct->buffer_fill_final < 0 )
         x264_log( h, X264_LOG_WARNING, "VBV underflow (frame %d, %.0f bits)\n", h->i_frame, rct->buffer_fill_final );
     rct->buffer_fill_final = X264_MAX( rct->buffer_fill_final, 0 );
-    rct->buffer_fill_final += rct->buffer_rate;
-    rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, rct->buffer_size );
+    rct->buffer_fill_final += rcc->buffer_rate;
+    rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, rcc->buffer_size );
 }
 
 // provisionally update VBV according to the planned size of all frames currently in progress
 static void update_vbv_plan( x264_t *h, int overhead )
 {
     x264_ratecontrol_t *rcc = h->rc;
-    rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final - overhead;
+    rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final;
     if( h->i_thread_frames > 1 )
     {
         int j = h->rc - h->thread[0]->rc;
@@ -1596,13 +1576,15 @@ static void update_vbv_plan( x264_t *h, int overhead )
             double bits = t->rc->frame_size_planned;
             if( !t->b_thread_active )
                 continue;
-            bits  = X264_MAX(bits, x264_ratecontrol_get_estimated_size(t));
+            bits  = X264_MAX(bits, t->rc->frame_size_estimated);
             rcc->buffer_fill -= bits;
             rcc->buffer_fill = X264_MAX( rcc->buffer_fill, 0 );
             rcc->buffer_fill += rcc->buffer_rate;
             rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
         }
     }
+    rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
+    rcc->buffer_fill -= overhead;
 }
 
 // apply VBV constraints and clip qscale to between lmin and lmax
@@ -1793,7 +1775,7 @@ static float rate_estimate_qscale( x264_t *h )
             rcc->frame_size_planned = qscale2bits( &rce, q );
         else
             rcc->frame_size_planned = predict_size( rcc->pred_b_from_p, q, h->fref1[h->i_ref1-1]->i_satd );
-        x264_ratecontrol_set_estimated_size(h, rcc->frame_size_planned);
+        h->rc->frame_size_estimated = rcc->frame_size_planned;
 
         /* For row SATDs */
         if( rcc->b_vbv )
@@ -1802,13 +1784,15 @@ static float rate_estimate_qscale( x264_t *h )
     }
     else
     {
-        double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate * h->i_thread_frames;
+        double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate;
 
         if( rcc->b_2pass )
         {
-            //FIXME adjust abr_buffer based on distance to the end of the video
             int64_t diff;
             int64_t predicted_bits = total_bits;
+            /* Adjust ABR buffer based on distance to the end of the video. */
+            if( rcc->num_entries > h->fenc->i_frame )
+                abr_buffer *= 0.5 * sqrt( rcc->num_entries - h->fenc->i_frame );
 
             if( rcc->b_vbv )
             {
@@ -1822,7 +1806,7 @@ static float rate_estimate_qscale( x264_t *h )
                         double bits = t->rc->frame_size_planned;
                         if( !t->b_thread_active )
                             continue;
-                        bits  = X264_MAX(bits, x264_ratecontrol_get_estimated_size(t));
+                        bits  = X264_MAX(bits, t->rc->frame_size_estimated);
                         predicted_bits += (int64_t)bits;
                     }
                 }
@@ -1963,61 +1947,96 @@ static float rate_estimate_qscale( x264_t *h )
         /* Always use up the whole VBV in this case. */
         if( rcc->single_frame_vbv )
             rcc->frame_size_planned = rcc->buffer_rate;
-        x264_ratecontrol_set_estimated_size(h, rcc->frame_size_planned);
+        h->rc->frame_size_estimated = rcc->frame_size_planned;
         return q;
     }
 }
 
+void x264_threads_normalize_predictors( x264_t *h )
+{
+    int t, thread_count = h->param.i_threads;
+    double planned_sum = 0, scale; /* sum of the per-thread slice plans; factor applied to each plan */
+    for( t = 0; t < thread_count; t++ )
+        planned_sum += h->thread[t]->rc->slice_size_planned;
+    scale = h->rc->frame_size_planned / planned_sum; /* rescale so the slice plans add up to the frame plan */
+    for( t = 0; t < thread_count; t++ )
+        h->thread[t]->rc->slice_size_planned *= scale;
+}
+
 void x264_threads_distribute_ratecontrol( x264_t *h )
 {
-    int i, row, totalsize = 0;
-    if( h->rc->b_vbv )
-        for( row = 0; row < h->sps->i_mb_height; row++ )
-            totalsize += h->fdec->i_row_satd[row];
+    int i, row;
+    x264_ratecontrol_t *rc = h->rc;
+
+    /* On the first frame, seed every thread's row predictors from h->rc. */
+    if( h->i_frame == 0 )
+        for( i = 0; i < h->param.i_threads; i++ )
+        {
+            x264_ratecontrol_t *t = h->thread[i]->rc;
+            memcpy( t->row_preds, rc->row_preds, sizeof(rc->row_preds) );
+        }
+
     for( i = 0; i < h->param.i_threads; i++ )
     {
         x264_t *t = h->thread[i];
-        x264_ratecontrol_t *rc = h->rc;
-        memcpy( t->rc, rc, sizeof(x264_ratecontrol_t) );
+        memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) ); /* copies only the members laid out before row_pred */
+        t->rc->row_pred = &t->rc->row_preds[h->sh.i_type]; /* the thread's own predictors for the current slice type */
         /* Calculate the planned slice size. */
-        if( h->rc->b_vbv && rc->frame_size_planned )
+        if( rc->b_vbv && rc->frame_size_planned )
         {
             int size = 0;
             for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
                 size += h->fdec->i_row_satd[row];
-            t->rc->slice_size_planned = size * rc->frame_size_planned / totalsize;
+            t->rc->slice_size_planned = predict_size( &rc->pred[h->sh.i_type + (i+1)*5], rc->qpm, size ); /* predictor slot (i+1)*5 + slice type, one group of 5 per thread */
         }
         else
             t->rc->slice_size_planned = 0;
     }
+    if( rc->b_vbv && rc->frame_size_planned )
+    {
+        x264_threads_normalize_predictors( h ); /* scale the slice plans so that they sum to frame_size_planned */
+
+        if( rc->single_frame_vbv )
+        {
+            /* Compensate for our max frame error threshold: give more bits (proportionally) to smaller slices. */
+            for( i = 0; i < h->param.i_threads; i++ )
+            {
+                x264_t *t = h->thread[i];
+                t->rc->max_frame_error = X264_MAX( 0.05, 1.0 / (t->i_threadslice_end - t->i_threadslice_start) ); /* at least 0.05, otherwise 1 / (rows in the slice) */
+                t->rc->slice_size_planned += 2 * t->rc->max_frame_error * rc->frame_size_planned;
+            }
+            x264_threads_normalize_predictors( h ); /* normalize again after the per-slice additions above */
+        }
+
+        for( i = 0; i < h->param.i_threads; i++ ) /* start each thread's size estimate at its planned slice size */
+            h->thread[i]->rc->frame_size_estimated = h->thread[i]->rc->slice_size_planned;
+    }
 }
 
 void x264_threads_merge_ratecontrol( x264_t *h )
 {
-    int i, j, k;
+    int i, row;
     x264_ratecontrol_t *rc = h->rc;
     x264_emms();
 
-    for( i = 1; i < h->param.i_threads; i++ )
+    for( i = 0; i < h->param.i_threads; i++ ) /* every thread, including thread 0, updates its slice predictor */
     {
-        x264_ratecontrol_t *t = h->thread[i]->rc;
-        rc->qpa_rc += t->qpa_rc;
-        rc->qpa_aq += t->qpa_aq;
-        for( j = 0; j < 5; j++ )
-            for( k = 0; k < 2; k++ )
-            {
-                rc->row_preds[j][k].coeff += t->row_preds[j][k].coeff;
-                rc->row_preds[j][k].offset += t->row_preds[j][k].offset;
-                rc->row_preds[j][k].count += t->row_preds[j][k].count;
-            }
-    }
-    for( j = 0; j < 5; j++ )
-        for( k = 0; k < 2; k++ )
+        x264_t *t = h->thread[i];
+        x264_ratecontrol_t *rct = h->thread[i]->rc;
+        if( h->param.rc.i_vbv_buffer_size ) /* slice predictors are only updated when a VBV buffer size is set */
         {
-            rc->row_preds[j][k].coeff /= h->param.i_threads;
-            rc->row_preds[j][k].offset /= h->param.i_threads;
-            rc->row_preds[j][k].count /= h->param.i_threads;
+            int size = 0; /* sum of the row SATD values over this thread's slice rows */
+            for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
+                size += h->fdec->i_row_satd[row];
+            int bits = t->stat.frame.i_mv_bits + t->stat.frame.i_tex_bits + t->stat.frame.i_misc_bits; /* mv + texture + misc bits from this thread's stats */
+            int mb_count = (t->i_threadslice_end - t->i_threadslice_start) * h->sps->i_mb_width; /* slice rows * macroblocks per row */
+            update_predictor( &rc->pred[h->sh.i_type+(i+1)*5], qp2qscale(rct->qpa_rc/mb_count), size, bits ); /* must be the slot (i+1)*5 that x264_threads_distribute_ratecontrol() predicts from */
         }
+        if( !i ) /* thread 0's QP sums are not added to rc (presumably rc already is thread 0's context -- confirm) */
+            continue;
+        rc->qpa_rc += rct->qpa_rc;
+        rc->qpa_aq += rct->qpa_aq;
+    }
 }
 
 void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
@@ -2027,8 +2046,7 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
 #define COPY(var) memcpy(&cur->rc->var, &prev->rc->var, sizeof(cur->rc->var))
         /* these vars are updated in x264_ratecontrol_start()
          * so copy them from the context that most recently started (prev)
-         * to the context that's about to start (cur).
-         */
+         * to the context that's about to start (cur). */
         COPY(accum_p_qp);
         COPY(accum_p_norm);
         COPY(last_satd);
@@ -2040,6 +2058,14 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
         COPY(bframes);
         COPY(prev_zone);
         COPY(qpbuf_pos);
+        /* these vars can be updated by x264_ratecontrol_init_reconfigurable */
+        COPY(buffer_rate);
+        COPY(buffer_size);
+        COPY(single_frame_vbv);
+        COPY(cbr_decay);
+        COPY(b_vbv_min_rate);
+        COPY(rate_factor_constant);
+        COPY(bitrate);
 #undef COPY
     }
     if( cur != next )
@@ -2047,8 +2073,7 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
 #define COPY(var) next->rc->var = cur->rc->var
         /* these vars are updated in x264_ratecontrol_end()
          * so copy them from the context that most recently ended (cur)
-         * to the context that's about to end (next)
-         */
+         * to the context that's about to end (next) */
         COPY(cplxr_sum);
         COPY(expected_bits_sum);
         COPY(wanted_bits_window);
diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
index 5a8d088..2767866 100644
--- a/encoder/ratecontrol.h
+++ b/encoder/ratecontrol.h
@@ -27,6 +27,8 @@
 int  x264_ratecontrol_new   ( x264_t * );
 void x264_ratecontrol_delete( x264_t * );
 
+void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
+
 void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
 void x264_adaptive_quant( x264_t * );
 int  x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 057f6a6..bb2ed64 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -1394,10 +1394,10 @@ int x264_rc_analyse_slice( x264_t *h )
             int mb_xy = y * h->mb.i_mb_stride;
             for( x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ )
             {
-                int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor) >> 8;
+                int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor + 128) >> 8;
                 int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy];
                 int diff = intra_cost - inter_cost;
-                h->fdec->i_row_satd[y] += diff;
+                h->fdec->i_row_satd[y] += (diff * frames[b]->i_inv_qscale_factor[mb_xy] + 128) >> 8;
                 cost += diff;
             }
         }
diff --git a/extras/gas-preprocessor.pl b/extras/gas-preprocessor.pl
new file mode 100755
index 0000000..d60893c
--- /dev/null
+++ b/extras/gas-preprocessor.pl
@@ -0,0 +1,256 @@
+#!/usr/bin/env perl
+# by David Conrad
+# This code is licensed under GPLv2 or later; go to gnu.org to read it
+#  (not that it much matters for an asm preprocessor)
+# usage: set your assembler to be something like "perl gas-preprocessor.pl gcc"
+use strict;
+
+# Apple's gas is ancient and doesn't support modern preprocessing features like
+# .rept and has ugly macro syntax, among other things. Thus, this script
+# implements the subset of the gas preprocessor used by x264 and ffmpeg
+# that isn't supported by Apple's gas.
+
+# FIXME: doesn't work if the path has spaces, but oh well...
+my $gcc_cmd = join(' ', @ARGV);     # the whole compiler command line as one string
+my $preprocess_c_cmd;               # command whose output is read as the input of pass 1
+
+if ($gcc_cmd =~ /\S+\.c/) {
+    # C file (inline asm?) - compile
+    $preprocess_c_cmd = "$gcc_cmd -S";
+    $gcc_cmd =~ s/\S+\.c/-x assembler -/g;      # the final command assembles from stdin instead
+} elsif ($gcc_cmd =~ /\S+\.S/) {
+    # asm file, just do C preprocessor
+    $preprocess_c_cmd = "$gcc_cmd -E";
+    $gcc_cmd =~ s/\S+\.S/-x assembler -/g;      # the final command assembles from stdin instead
+} else {
+    die "Unrecognized input filetype";
+}
+
+$preprocess_c_cmd =~ s/\S+\.o/-/g;  # object file name becomes "-" (presumably so that "-o x.o" writes to stdout)
+
+open(ASMFILE, "-|", $preprocess_c_cmd) || die "Error running preprocessor";    # read the first command's output
+
+my $current_macro = '';     # name of the macro being recorded; '' when outside .macro/.endm
+my %macro_lines;            # macro name => array of the macro's body lines
+my %macro_args;             # macro name => array of argument names, in declaration order
+my %macro_args_default;     # macro name => { argument name => default value }
+
+my @pass1_lines;            # result of pass 1: the input with all macros expanded
+
+# pass 1: parse .macro
+# note that the handling of arguments is probably overly permissive vs. gas
+# but it should be the same for valid cases
+while (<ASMFILE>) {
+    # comment out unsupported directives
+    s/\.type/@.type/x;
+    s/\.func/@.func/x;
+    s/\.endfunc/@.endfunc/x;
+    s/\.ltorg/@.ltorg/x;
+    s/\.size/@.size/x;
+    s/\.fpu/@.fpu/x;
+
+    # the syntax for these is a little different
+    s/\.global/.globl/x;
+    # also catch .section .rodata since the equivalent to .const_data is .section __DATA,__const
+    s/(.*)\.rodata/.const_data/x;
+    s/\.int/.long/x;
+    s/\.float/.single/x;
+
+    # catch unknown section names that aren't mach-o style (with a comma)
+    if (/\.section ([^,]*)$/) {     # the dot is escaped so only a real .section directive matches
+        die ".section $1 unsupported; figure out the mach-o section name and add it";
+    }
+
+    # macros creating macros is not handled (is that valid?)
+    if (/\.macro\s+([\d\w\.]+)\s*(.*)/) {
+        $current_macro = $1;
+
+        # commas in the argument list are optional, so only use whitespace as the separator
+        my $arglist = $2;
+        $arglist =~ s/,/ /g;
+
+        my @args = split(/\s+/, $arglist);
+        foreach my $i (0 .. $#args) {
+            my @argpair = split(/=/, $args[$i]);    # "name=default" => (name, default)
+            $macro_args{$current_macro}[$i] = $argpair[0];      # stored with any :vararg suffix kept
+            $argpair[0] =~ s/:vararg$//;
+            $macro_args_default{$current_macro}{$argpair[0]} = $argpair[1];
+        }
+        # ensure %macro_lines has the macro name added as a key
+        $macro_lines{$current_macro} = [];
+    } elsif (/\.endm/) {
+        if (!$current_macro) {
+            die "ERROR: .endm without .macro";
+        }
+        $current_macro = '';
+    } elsif ($current_macro) {
+        push(@{$macro_lines{$current_macro}}, $_);  # inside a macro definition: record the body line
+    } else {
+        expand_macros($_);                          # ordinary line: expand macro uses into @pass1_lines
+    }
+}
+
+sub expand_macros {     # append one input line to @pass1_lines, expanding macro invocations recursively
+    my $line = $_[0];
+    if ($line =~ /(\S+:|)\s*([\w\d\.]+)\s*(.*)/ && exists $macro_lines{$2}) {
+        push(@pass1_lines, $1);     # keep the optional leading label; $2 is the macro name, $3 its arguments
+        my $macro = $2;
+
+        # commas are optional here too, but are syntactically important because
+        # parameters can be blank
+        my @arglist = split(/,/, $3);
+        my @args;
+        foreach (@arglist) {
+            my @whitespace_split = split(/\s+/, $_);
+            if (!@whitespace_split) {
+                push(@args, '');    # nothing between two commas: a blank parameter
+            } else {
+                foreach (@whitespace_split) {
+                    if (length($_)) {
+                        push(@args, $_);
+                    }
+                }
+            }
+        }
+
+        my %replacements;           # argument name => replacement text, starting from the defaults
+        if ($macro_args_default{$macro}){
+            %replacements = %{$macro_args_default{$macro}};
+        }
+
+        # construct hashtable of text to replace
+        foreach my $i (0 .. $#args) {
+            my $argname = $macro_args{$macro}[$i];
+
+            if ($args[$i] =~ m/=/) {
+                # arg=val references the argument name
+                # XXX: I'm not sure what the expected behaviour is if a lot of
+                # these are mixed with unnamed args
+                my @named_arg = split(/=/, $args[$i]);
+                $replacements{$named_arg[0]} = $named_arg[1];
+            } elsif ($i > $#{$macro_args{$macro}}) {
+                # more args given than the macro has named args
+                # XXX: is vararg allowed on arguments before the last?
+                $argname = $macro_args{$macro}[-1];
+                if ($argname =~ s/:vararg$//) {
+                    $replacements{$argname} .= ", $args[$i]";
+                } else {
+                    die "Too many arguments to macro $macro";
+                }
+            } else {
+                $argname =~ s/:vararg$//;
+                $replacements{$argname} = $args[$i];
+            }
+        }
+
+        # apply replacements as regex
+        foreach (@{$macro_lines{$macro}}) {
+            my $macro_line = $_;
+            # do replacements by longest first, this avoids wrong replacement
+            # when argument names are subsets of each other
+            foreach (reverse sort {length $a <=> length $b} keys %replacements) {
+                $macro_line =~ s/\\$_/$replacements{$_}/g;
+            }
+            $macro_line =~ s/\\\(\)//g;     # remove \()
+            expand_macros($macro_line);     # the expanded body line may itself invoke a macro
+        }
+    } else {
+        push(@pass1_lines, $line);  # not a macro invocation: pass the line through unchanged
+    }
+}
+
+close(ASMFILE) or exit 1;
+open(ASMFILE, "|-", $gcc_cmd) or die "Error running assembler";    # pass 2 output is piped into this command
+
+my @sections;           # section directives seen so far, used to resolve .previous
+my $num_repts;          # repeat count of the .rept block being collected
+my $rept_lines;         # collected body of that .rept block; empty when not inside one
+
+my %literal_labels;     # for ldr <reg>, =<expr>
+my $literal_num = 0;
+
+# pass 2: parse .rept and .if variants
+# NOTE: since we don't implement a proper parser, using .rept with a
+# variable assigned from .set is not supported
+foreach my $line (@pass1_lines) {
+    # textual comparison .if
+    # this assumes nothing else on the same line
+    if ($line =~ /\.ifnb\s+(.*)/) {
+        if (length($1)) {   # test the length: a lone "0" is a non-blank argument but a false value in Perl
+            $line = ".if 1\n";
+        } else {
+            $line = ".if 0\n";
+        }
+    } elsif ($line =~ /\.ifb\s+(.*)/) {
+        if (length($1)) {   # same length test as for .ifnb
+            $line = ".if 0\n";
+        } else {
+            $line = ".if 1\n";
+        }
+    } elsif ($line =~ /\.ifc\s+(.*)\s*,\s*(.*)/) {
+        if ($1 eq $2) {
+            $line = ".if 1\n";
+        } else {
+            $line = ".if 0\n";
+        }
+    }
+
+    # handle .previous (only with regard to .section not .subsection)
+    if ($line =~ /\.(section|text|const_data)/) {
+        push(@sections, $line);
+    } elsif ($line =~ /\.previous/) {
+        if (!$sections[-2]) {
+            die ".previous without a previous section";
+        }
+        $line = $sections[-2];      # re-emit the section directive before the current one
+        push(@sections, $line);
+    }
+
+    # handle ldr <reg>, =<expr>
+    if ($line =~ /(.*)\s*ldr([\w\s\d]+)\s*,\s*=(.*)/) {
+        my $label = $literal_labels{$3};    # reuse the label if this expression is already pending
+        if (!$label) {
+            $label = ".Literal_$literal_num";
+            $literal_num++;
+            $literal_labels{$3} = $label;
+        }
+        $line = "$1 ldr$2, $label\n";
+    } elsif ($line =~ /\.ltorg/) {
+        foreach my $literal (keys %literal_labels) {    # emit the pending literals after this line
+            $line .= "$literal_labels{$literal}:\n .word $literal\n";
+        }
+        %literal_labels = ();
+    }
+
+    # @l -> lo16()  @ha -> ha16()
+    $line =~ s/,\s+([^,]+)\@l(\s)/, lo16($1)$2/g;
+    $line =~ s/,\s+([^,]+)\@ha(\s)/, ha16($1)$2/g;
+
+    if ($line =~ /\.rept\s+(.*)/) {
+        $num_repts = $1;
+        $rept_lines = "\n";     # non-empty on purpose: marks that a .rept block is being collected
+
+        # handle the possibility of repeating another directive on the same line
+        # .endr on the same line is not valid, I don't know if a non-directive is
+        if ($num_repts =~ s/(\.\w+.*)//) {
+            $rept_lines .= "$1\n";
+        }
+        $num_repts = eval($num_repts);  # the count may be an arithmetic expression
+    } elsif ($line =~ /\.endr/) {
+        for (1 .. $num_repts) {
+            print ASMFILE $rept_lines;
+        }
+        $rept_lines = '';
+    } elsif ($rept_lines) {
+        $rept_lines .= $line;
+    } else {
+        print ASMFILE $line;
+    }
+}
+
+print ASMFILE ".text\n";
+foreach my $literal (keys %literal_labels) {    # literals still pending at the end of the input
+    print ASMFILE "$literal_labels{$literal}:\n .word $literal\n";
+}
+
+close(ASMFILE) or exit 1;
diff --git a/input/avs.c b/input/avs.c
index 522f8fe..79b5c80 100644
--- a/input/avs.c
+++ b/input/avs.c
@@ -313,4 +313,4 @@ static int close_file( hnd_t handle )
     return 0;
 }
 
-cli_input_t avs_input = { open_file, get_frame_total, picture_alloc, read_frame, release_frame, picture_clean, close_file };
+const cli_input_t avs_input = { open_file, get_frame_total, picture_alloc, read_frame, release_frame, picture_clean, close_file };
diff --git a/input/ffms.c b/input/ffms.c
index b680967..14962c7 100644
--- a/input/ffms.c
+++ b/input/ffms.c
@@ -244,4 +244,4 @@ static int close_file( hnd_t handle )
     return 0;
 }
 
-cli_input_t ffms_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
+const cli_input_t ffms_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
diff --git a/input/input.h b/input/input.h
index 9fb425c..6e386f4 100644
--- a/input/input.h
+++ b/input/input.h
@@ -60,11 +60,11 @@ typedef struct
     int (*close_file)( hnd_t handle );
 } cli_input_t;
 
-extern cli_input_t yuv_input;
-extern cli_input_t y4m_input;
-extern cli_input_t avs_input;
+extern const cli_input_t yuv_input;
+extern const cli_input_t y4m_input;
+extern const cli_input_t avs_input;
 extern cli_input_t thread_input;
-extern cli_input_t lavf_input;
-extern cli_input_t ffms_input;
+extern const cli_input_t lavf_input;
+extern const cli_input_t ffms_input;
 
 #endif
diff --git a/input/lavf.c b/input/lavf.c
index 180e509..6ecc6b0 100644
--- a/input/lavf.c
+++ b/input/lavf.c
@@ -269,4 +269,4 @@ static int close_file( hnd_t handle )
     return 0;
 }
 
-cli_input_t lavf_input = { open_file, get_frame_total, picture_alloc, read_frame, NULL, picture_clean, close_file };
+const cli_input_t lavf_input = { open_file, get_frame_total, picture_alloc, read_frame, NULL, picture_clean, close_file };
diff --git a/input/y4m.c b/input/y4m.c
index 1619f74..8645ff7 100644
--- a/input/y4m.c
+++ b/input/y4m.c
@@ -242,4 +242,4 @@ static int close_file( hnd_t handle )
     return 0;
 }
 
-cli_input_t y4m_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
+const cli_input_t y4m_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
diff --git a/input/yuv.c b/input/yuv.c
index dbd0317..3e39e07 100644
--- a/input/yuv.c
+++ b/input/yuv.c
@@ -125,4 +125,4 @@ static int close_file( hnd_t handle )
     return 0;
 }
 
-cli_input_t yuv_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
+const cli_input_t yuv_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
diff --git a/output/flv.c b/output/flv.c
index b3e5d16..2e0a0e4 100644
--- a/output/flv.c
+++ b/output/flv.c
@@ -305,4 +305,4 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
     return 0;
 }
 
-cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
+const cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/output/matroska.c b/output/matroska.c
index 8e84f52..fb39ced 100644
--- a/output/matroska.c
+++ b/output/matroska.c
@@ -146,7 +146,7 @@ static int write_headers( hnd_t handle, x264_nal_t *p_nal )
 
     memcpy( avcC+11+sps_size, pps, pps_size );
 
-    ret = mk_writeHeader( p_mkv->w, "x264", "V_MPEG4/ISO/AVC",
+    ret = mk_writeHeader( p_mkv->w, "x264" X264_VERSION, "V_MPEG4/ISO/AVC",
                           avcC, avcC_len, p_mkv->frame_duration, 50000,
                           p_mkv->width, p_mkv->height,
                           p_mkv->d_width, p_mkv->d_height );
@@ -185,7 +185,7 @@ static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_
 
     p_mkv->b_writing_frame = 0;
 
-    if( mk_set_frame_flags( p_mkv->w, i_stamp, p_picture->b_keyframe ) < 0 )
+    if( mk_set_frame_flags( p_mkv->w, i_stamp, p_picture->b_keyframe, p_picture->i_type == X264_TYPE_B ) < 0 )
         return -1;
 
     return i_size;
@@ -206,4 +206,4 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
     return ret;
 }
 
-cli_output_t mkv_output = { open_file, set_param, write_headers, write_frame, close_file };
+const cli_output_t mkv_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/output/matroska_ebml.c b/output/matroska_ebml.c
index d1c6e13..7265909 100644
--- a/output/matroska_ebml.c
+++ b/output/matroska_ebml.c
@@ -53,9 +53,9 @@ struct mk_writer
     int64_t def_duration;
     int64_t timescale;
     int64_t cluster_tc_scaled;
-    int64_t frame_tc, prev_frame_tc_scaled, max_frame_tc;
+    int64_t frame_tc, max_frame_tc;
 
-    char wrote_header, in_frame, keyframe;
+    char wrote_header, in_frame, keyframe, skippable;
 };
 
 static mk_context *mk_create_context( mk_writer *w, mk_context *parent, unsigned id )
@@ -258,23 +258,6 @@ static int mk_write_uint( mk_context *c, unsigned id, int64_t ui )
     return 0;
 }
 
-static int mk_write_sint( mk_context *c, unsigned id, int64_t si )
-{
-    unsigned char c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
-    unsigned i = 0;
-
-    CHECK( mk_write_id( c, id ) );
-    if( si < 0 )
-        while( i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80 )
-            ++i;
-    else
-        while( i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80 ) )
-            ++i;
-    CHECK( mk_write_size( c, 8 - i ) );
-    CHECK( mk_append_context_data( c, c_si+i, 8 - i ) );
-    return 0;
-}
-
 static int mk_write_float_raw( mk_context *c, float f )
 {
     union
@@ -301,34 +284,6 @@ static int mk_write_float( mk_context *c, unsigned id, float f )
     return 0;
 }
 
-static unsigned mk_ebml_size_size( unsigned s )
-{
-    if( s < 0x7f )
-        return 1;
-    if( s < 0x3fff )
-        return 2;
-    if( s < 0x1fffff )
-        return 3;
-    if( s < 0x0fffffff )
-        return 4;
-    return 5;
-}
-
-static unsigned mk_ebml_sint_size( int64_t si )
-{
-    unsigned char c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
-    unsigned i = 0;
-
-    if( si < 0 )
-        while( i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80 )
-            ++i;
-    else
-        while( i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80) )
-            ++i;
-
-    return 8 - i;
-}
-
 mk_writer *mk_create_writer( const char *filename )
 {
     mk_writer *w = malloc( sizeof(*w) );
@@ -446,8 +401,8 @@ static int mk_close_cluster( mk_writer *w )
 
 static int mk_flush_frame( mk_writer *w )
 {
-    int64_t delta, ref = 0;
-    unsigned fsize, bgsize;
+    int64_t delta;
+    unsigned fsize;
     unsigned char c_delta_flags[3];
 
     if( !w->in_frame )
@@ -470,33 +425,22 @@ static int mk_flush_frame( mk_writer *w )
     }
 
     fsize = w->frame ? w->frame->d_cur : 0;
-    bgsize = fsize + 4 + mk_ebml_size_size( fsize + 4 ) + 1;
-    if( !w->keyframe )
-    {
-        ref = w->prev_frame_tc_scaled - w->cluster_tc_scaled - delta;
-        bgsize += 1 + 1 + mk_ebml_sint_size( ref );
-    }
 
-    CHECK( mk_write_id( w->cluster, 0xa0 ) ); // BlockGroup
-    CHECK( mk_write_size( w->cluster, bgsize ) );
-    CHECK( mk_write_id( w->cluster, 0xa1 ) ); // Block
+    CHECK( mk_write_id( w->cluster, 0xa3 ) ); // SimpleBlock
     CHECK( mk_write_size( w->cluster, fsize + 4 ) );
     CHECK( mk_write_size( w->cluster, 1 ) ); // track number
 
     c_delta_flags[0] = delta >> 8;
     c_delta_flags[1] = delta;
-    c_delta_flags[2] = 0;
+    c_delta_flags[2] = (w->keyframe << 7) | w->skippable;
     CHECK( mk_append_context_data( w->cluster, c_delta_flags, 3 ) );
     if( w->frame )
     {
         CHECK( mk_append_context_data( w->cluster, w->frame->data, w->frame->d_cur ) );
         w->frame->d_cur = 0;
     }
-    if( !w->keyframe )
-        CHECK( mk_write_sint( w->cluster, 0xfb, ref ) ); // ReferenceBlock
 
     w->in_frame = 0;
-    w->prev_frame_tc_scaled = w->cluster_tc_scaled + delta;
 
     if( w->cluster->d_cur > CLSIZE )
         CHECK( mk_close_cluster( w ) );
@@ -509,19 +453,21 @@ int mk_start_frame( mk_writer *w )
     if( mk_flush_frame( w ) < 0 )
         return -1;
 
-    w->in_frame = 1;
-    w->keyframe = 0;
+    w->in_frame  = 1;
+    w->keyframe  = 0;
+    w->skippable = 0;
 
     return 0;
 }
 
-int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe )
+int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable )
 {
     if( !w->in_frame )
         return -1;
 
-    w->frame_tc = timestamp;
-    w->keyframe = keyframe != 0;
+    w->frame_tc  = timestamp;
+    w->keyframe  = keyframe  != 0;
+    w->skippable = skippable != 0;
 
     if( w->max_frame_tc < timestamp )
         w->max_frame_tc = timestamp;
diff --git a/output/matroska_ebml.h b/output/matroska_ebml.h
index 252e781..56eb8cc 100644
--- a/output/matroska_ebml.h
+++ b/output/matroska_ebml.h
@@ -35,7 +35,7 @@ int mk_writeHeader( mk_writer *w, const char *writing_app,
 
 int mk_start_frame( mk_writer *w );
 int mk_add_frame_data( mk_writer *w, const void *data, unsigned size );
-int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe );
+int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable );
 int mk_close( mk_writer *w, int64_t last_delta );
 
 #endif
diff --git a/output/mp4.c b/output/mp4.c
index e3ad9c6..b99eaed 100644
--- a/output/mp4.c
+++ b/output/mp4.c
@@ -121,7 +121,7 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
         if( mdhd_duration != total_duration )
         {
             uint64_t last_dts = gf_isom_get_sample_dts( p_mp4->p_file, p_mp4->i_track, p_mp4->i_numframe );
-            uint32_t last_duration = (uint32_t)( mdhd_duration > last_dts ? mdhd_duration - last_dts : (largest_pts - second_largest_pts) * p_mp4->i_time_inc  );
+            uint32_t last_duration = (uint32_t)( mdhd_duration > last_dts ? mdhd_duration - last_dts : (largest_pts - second_largest_pts) * p_mp4->i_time_inc );
             gf_isom_set_last_sample_duration( p_mp4->p_file, p_mp4->i_track, last_duration );
             total_duration = gf_isom_get_media_duration( p_mp4->p_file, p_mp4->i_track );
         }
@@ -212,6 +212,7 @@ static int set_param( hnd_t handle, x264_param_t *p_param )
             dw *= sar ;
         else
             dh /= sar;
+        gf_isom_set_pixel_aspect_ratio( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_param->vui.i_sar_width, p_param->vui.i_sar_height );
         gf_isom_set_track_layout_info( p_mp4->p_file, p_mp4->i_track, dw, dh, 0, 0, 0 );
     }
 
@@ -297,4 +298,4 @@ static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_
     return i_size;
 }
 
-cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file };
+const cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/output/output.h b/output/output.h
index 851b819..c79b48e 100644
--- a/output/output.h
+++ b/output/output.h
@@ -33,9 +33,9 @@ typedef struct
     int (*close_file)( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts );
 } cli_output_t;
 
-extern cli_output_t raw_output;
-extern cli_output_t mkv_output;
-extern cli_output_t mp4_output;
-extern cli_output_t flv_output;
+extern const cli_output_t raw_output;
+extern const cli_output_t mkv_output;
+extern const cli_output_t mp4_output;
+extern const cli_output_t flv_output;
 
 #endif
diff --git a/output/raw.c b/output/raw.c
index a4d1175..02e4c56 100644
--- a/output/raw.c
+++ b/output/raw.c
@@ -62,5 +62,5 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
     return fclose( (FILE*)handle );
 }
 
-cli_output_t raw_output = { open_file, set_param, write_headers, write_frame, close_file };
+const cli_output_t raw_output = { open_file, set_param, write_headers, write_frame, close_file };
 
diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm
index 966615b..1970cb9 100644
--- a/tools/checkasm-a.asm
+++ b/tools/checkasm-a.asm
@@ -71,19 +71,19 @@ cglobal x264_checkasm_call, 4,7,16
 %endrep
 %assign i 6
 %rep 16-6
-    movdqa xmm %+ i, [x %+ i GLOBAL]
+    movdqa xmm %+ i, [x %+ i]
     %assign i i+1
 %endrep
-    mov  r4, [n4 GLOBAL]
-    mov  r5, [n5 GLOBAL]
+    mov  r4, [n4]
+    mov  r5, [n5]
     call r6
-    xor  r4, [n4 GLOBAL]
-    xor  r5, [n5 GLOBAL]
+    xor  r4, [n4]
+    xor  r5, [n5]
     or   r4, r5
     pxor xmm5, xmm5
 %assign i 6
 %rep 16-6
-    pxor xmm %+ i, [x %+ i GLOBAL]
+    pxor xmm %+ i, [x %+ i]
     por  xmm5, xmm %+ i
     %assign i i+1
 %endrep
@@ -92,7 +92,7 @@ cglobal x264_checkasm_call, 4,7,16
     or   r4, r5
     jz .ok
     mov  r4, rax
-    lea  r0, [error_message GLOBAL]
+    lea  r0, [error_message]
     call puts
     mov  r1, [rsp+stack_offset+16]
     mov  dword [r1], 0
@@ -132,7 +132,7 @@ cglobal x264_checkasm_call, 1,7
     or   r3, r5
     jz .ok
     mov  r3, eax
-    lea  r1, [error_message GLOBAL]
+    lea  r1, [error_message]
     push r1
     call puts
     add  esp, 4
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 0bedc5b..595bd9e 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1662,13 +1662,13 @@ static int check_all_flags( void )
         cpu1 &= ~X264_CPU_CACHELINE_64;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
     }
-#elif ARCH_PPC
+#elif defined(ARCH_PPC)
     if( x264_cpu_detect() & X264_CPU_ALTIVEC )
     {
         fprintf( stderr, "x264: ALTIVEC against C\n" );
         ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
     }
-#elif ARCH_ARM
+#elif defined(ARCH_ARM)
     if( x264_cpu_detect() & X264_CPU_ARMV6 )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
     if( x264_cpu_detect() & X264_CPU_NEON )
diff --git a/x264.c b/x264.c
index 58bc1f4..959626a 100644
--- a/x264.c
+++ b/x264.c
@@ -250,23 +250,23 @@ static void Help( x264_param_t *defaults, int longhelp )
         "                                  - faster:\n"
         "                                    --no-mbtree --no-mixed-refs --ref 2\n"
         "                                    --subme 4 --weightp 1\n"
-        "                                  - fast\n"
+        "                                  - fast:\n"
         "                                    --rc-lookahead 30 --ref 2 --subme 6\n"
-        "                                  - medium\n"
+        "                                  - medium:\n"
         "                                    Default settings apply.\n"
-        "                                  - slow\n"
+        "                                  - slow:\n"
         "                                    --b-adapt 2 --direct auto --me umh\n"
         "                                    --rc-lookahead 50 --ref 5 --subme 8\n"
-        "                                  - slower\n"
+        "                                  - slower:\n"
         "                                    --b-adapt 2 --direct auto --me umh\n"
         "                                    --partitions all --rc-lookahead 60\n"
         "                                    --ref 8 --subme 9 --trellis 2\n"
-        "                                  - veryslow\n"
+        "                                  - veryslow:\n"
         "                                    --b-adapt 2 --bframes 8 --direct auto\n"
         "                                    --me umh --merange 24 --partitions all\n"
         "                                    --ref 16 --subme 10 --trellis 2\n"
         "                                    --rc-lookahead 60\n"
-        "                                  - placebo\n"
+        "                                  - placebo:\n"
         "                                    --bframes 16 --b-adapt 2 --direct auto\n"
         "                                    --slow-firstpass --no-fast-pskip\n"
         "                                    --me tesa --merange 24 --partitions all\n"
@@ -281,31 +281,26 @@ static void Help( x264_param_t *defaults, int longhelp )
         "                                  Only one psy tuning can be used at a time.\n" );
     H2( "                                  - film (psy tuning):\n"
         "                                    --deblock -1:-1 --psy-rd <unset>:0.15\n"
-        "                                  - animation(psy tuning):\n"
+        "                                  - animation (psy tuning):\n"
         "                                    --bframes {+2} --deblock 1:1\n"
         "                                    --psy-rd 0.4:<unset> --aq-strength 0.6\n"
         "                                    --ref {Double if >1 else 1}\n"
-        "                                  - grain(psy tuning):\n"
+        "                                  - grain (psy tuning):\n"
         "                                    --aq-strength 0.5 --no-dct-decimate\n"
         "                                    --deadzone-inter 6 --deadzone-intra 6\n"
         "                                    --deblock -2:-2 --ipratio 1.1 \n"
         "                                    --pbratio 1.1 --psy-rd <unset>:0.25\n"
         "                                    --qcomp 0.8\n"
-        "                                  - psnr(psy tuning):\n"
+        "                                  - psnr (psy tuning):\n"
         "                                    --aq-mode 0 --no-psy\n"
-        "                                  - ssim(psy tuning):\n"
+        "                                  - ssim (psy tuning):\n"
         "                                    --aq-mode 2 --no-psy\n"
         "                                  - fastdecode:\n"
         "                                    --no-cabac --no-deblock --no-weightb\n"
         "                                    --weightp 0\n"
         "                                  - zerolatency:\n"
         "                                    --bframes 0 --rc-lookahead 0\n"
-        "                                    --sync-lookahead 0 --sliced-threads\n"
-        "                                  - touhou(psy tuning):\n"
-        "                                    --aq-strength 1.3 --deblock -1:-1\n"
-        "                                    --partitions {p4x4 if p8x8 set}\n"
-        "                                    --psy-rd <unset>:0.2\n"
-        "                                    --ref {Double if >1 else 1}\n" );
+        "                                    --sync-lookahead 0 --sliced-threads\n" );
     else H0( "                                  - psy tunings: film,animation,grain,psnr,ssim\n"
              "                                  - other tunings: fastdecode,zerolatency\n" );
     H1( "      --slow-firstpass        Don't use faster settings with --pass 1\n" );
diff --git a/x264.h b/x264.h
index 2550864..e7d19b7 100644
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@
 
 #include <stdarg.h>
 
-#define X264_BUILD 84
+#define X264_BUILD 85
 
 /* x264_t:
  *      opaque handler for encoder */
@@ -480,11 +480,12 @@ typedef struct
 x264_t *x264_encoder_open( x264_param_t * );
 
 /* x264_encoder_reconfig:
- *      analysis-related parameters from x264_param_t are copied.
+ *      various parameters from x264_param_t are copied.
  *      this takes effect immediately, on whichever frame is encoded next;
  *      due to delay, this may not be the next frame passed to encoder_encode.
  *      if the change should apply to some particular frame, use x264_picture_t->param instead.
- *      returns 0 on success, negative on parameter validation error. */
+ *      returns 0 on success, negative on parameter validation error.
+ *      not all parameters can be changed; see the actual function for a detailed breakdown. */
 int     x264_encoder_reconfig( x264_t *, x264_param_t * );
 /* x264_encoder_parameters:
  *      copies the current internal set of parameters to the pointer provided

-- 
x264 packaging