[SCM] x264 packaging branch, ubuntu, updated. debian/0.85.1442.1+git781d30-1-2-g11c4c4e
siretart at users.alioth.debian.org
siretart at users.alioth.debian.org
Sat Feb 20 11:22:36 UTC 2010
The following commit has been merged in the ubuntu branch:
commit 4f815c28198ee157915dd4147b55563078ca59d1
Author: Reinhard Tartler <siretart at tauware.de>
Date: Sat Feb 20 10:14:59 2010 +0100
Imported Upstream version 0.85.1442.1+git781d30
diff --git a/common/arm/asm.S b/common/arm/asm.S
index d163165..395267f 100644
--- a/common/arm/asm.S
+++ b/common/arm/asm.S
@@ -20,6 +20,12 @@
#include "config.h"
+#ifdef PREFIX
+# define EXTERN_ASM _
+#else
+# define EXTERN_ASM
+#endif
+
#ifdef __ELF__
# define ELF
#else
@@ -35,7 +41,8 @@ ELF .eabi_attribute 25, \val
.endm
.macro function name
- .global \name
+ .global EXTERN_ASM\name
+EXTERN_ASM\name:
ELF .hidden \name
ELF .type \name, %function
.func \name
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index a62af39..e1db404 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -432,6 +432,311 @@ avg2_w20_loop:
.endfunc
+.macro weight_prologue type // shared entry code; type full additionally loads denom
+ push {r4-r5,lr} // 3 words pushed, so the stack arguments now start at sp+4*3
+ ldr r4, [sp, #4*3] // weight_t (first stack argument)
+ ldr ip, [sp, #4*3+4] // h (second stack argument); the loops count it down in ip
+.ifc \type, full
+ ldr lr, [r4, #32] // denom
+.endif
+ ldrd r4, [r4, #32+4] // scale, offset -> r4, r5 (overwrites the weight_t pointer)
+ vdup.16 q0, r4 // q0 = scale in every 16-bit lane
+ vdup.16 q1, r5 // q1 = offset in every 16-bit lane
+.ifc \type, full
+ rsb lr, lr, #0 // lr = -denom
+ vdup.16 q2, lr // q2 = -denom per lane, used as the vrshl shift count by the kernels
+.endif
+.endm
+
+// void mc_weight( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride,
+// const x264_weight_t *weight, int height )
+function x264_mc_weight_w20_neon // 20 pixels wide, 2 rows per iteration, scale + denom shift + offset
+ weight_prologue full
+ sub r1, #16 // the 16-byte store below already advances r0 by 16
+weight20_loop:
+ subs ip, #2 // two rows per iteration; these flags feed the bgt at the bottom
+ vld1.8 {d17-d19}, [r2], r3 // row 0: 24 bytes loaded, 20 of them end up stored
+ vmovl.u8 q10, d17 // row 0 pixels 0-7 widened to 16 bit
+ vmovl.u8 q11, d18 // row 0 pixels 8-15
+ vmovl.u8 q14, d19 // row 0 pixels 16-23 (only d28 is used afterwards)
+ vld1.8 {d16-d18}, [r2], r3 // row 1; d17-d18 may be reused, row 0 is already widened
+ vmovl.u8 q12, d16 // row 1 pixels 0-7
+ vmovl.u8 q13, d17 // row 1 pixels 8-15
+ vmovl.u8 q15, d18 // row 1 pixels 16-23 (only d30 is used afterwards)
+ vmul.s16 q10, q10, q0 // pixel * scale
+ vmul.s16 q11, q11, q0
+ vmul.s16 q12, q12, q0
+ vmul.s16 q13, q13, q0
+ vmul.s16 d28, d28, d0 // row 0 pixels 16-19
+ vmul.s16 d29, d30, d0 // row 1 pixels 16-19, packed next to row 0 in q14
+ vrshl.s16 q10, q10, q2 // shift count is -denom: rounding right shift by denom
+ vrshl.s16 q11, q11, q2
+ vrshl.s16 q12, q12, q2
+ vrshl.s16 q13, q13, q2
+ vrshl.s16 q14, q14, q2
+ vadd.s16 q10, q10, q1 // + offset
+ vadd.s16 q11, q11, q1
+ vadd.s16 q12, q12, q1
+ vadd.s16 q13, q13, q1
+ vadd.s16 q14, q14, q1
+ vqmovun.s16 d16, q10 // saturating narrow back to unsigned 8 bit
+ vqmovun.s16 d17, q11
+ vqmovun.s16 d18, q12
+ vqmovun.s16 d19, q13
+ vqmovun.s16 d20, q14 // d20[0] = row 0 tail, d20[1] = row 1 tail
+ vst1.8 {d16-d17}, [r0,:128]! // row 0 pixels 0-15, r0 += 16
+ vst1.32 {d20[0]}, [r0,:32], r1 // row 0 pixels 16-19, r0 += dst_stride - 16
+ vst1.8 {d18-d19}, [r0,:128]! // row 1 pixels 0-15
+ vst1.32 {d20[1]}, [r0,:32], r1 // row 1 pixels 16-19
+ bgt weight20_loop // continue while the subs above left ip > 0
+ pop {r4-r5,pc} // restore r4-r5 and return
+.endfunc
+
+function x264_mc_weight_w16_neon // 16 pixels wide, 2 rows per iteration, scale + denom shift + offset
+ weight_prologue full
+weight16_loop:
+ subs ip, #2 // two rows per iteration; flags are consumed by the bgt below
+ vld1.8 {d16-d17}, [r2], r3 // row 0 from [r2], then r2 += r3
+ vld1.8 {d18-d19}, [r2], r3 // row 1
+ vmovl.u8 q10, d16 // widen to 16 bit: row 0 pixels 0-7
+ vmovl.u8 q11, d17 // row 0 pixels 8-15
+ vmovl.u8 q12, d18 // row 1 pixels 0-7
+ vmovl.u8 q13, d19 // row 1 pixels 8-15
+ vmul.s16 q10, q10, q0 // pixel * scale (q0 set by weight_prologue)
+ vmul.s16 q11, q11, q0
+ vmul.s16 q12, q12, q0
+ vmul.s16 q13, q13, q0
+ vrshl.s16 q10, q10, q2 // q2 holds -denom: rounding right shift by denom
+ vrshl.s16 q11, q11, q2
+ vrshl.s16 q12, q12, q2
+ vrshl.s16 q13, q13, q2
+ vadd.s16 q10, q10, q1 // + offset (q1 set by weight_prologue)
+ vadd.s16 q11, q11, q1
+ vadd.s16 q12, q12, q1
+ vadd.s16 q13, q13, q1
+ vqmovun.s16 d16, q10 // saturating narrow back to unsigned 8 bit
+ vqmovun.s16 d17, q11
+ vqmovun.s16 d18, q12
+ vqmovun.s16 d19, q13
+ vst1.8 {d16-d17}, [r0,:128], r1 // row 0 to [r0], then r0 += r1
+ vst1.8 {d18-d19}, [r0,:128], r1 // row 1
+ bgt weight16_loop // continue while ip > 0
+ pop {r4-r5,pc} // restore r4-r5 and return
+.endfunc
+
+function x264_mc_weight_w8_neon // 8 pixels wide, 2 rows per iteration, scale + denom shift + offset
+ weight_prologue full
+weight8_loop:
+ subs ip, #2 // two rows per iteration; flags are consumed by the bgt below
+ vld1.8 {d16}, [r2], r3 // row 0
+ vld1.8 {d18}, [r2], r3 // row 1
+ vmovl.u8 q8, d16 // widen row 0 to 16 bit in place
+ vmovl.u8 q9, d18 // widen row 1 to 16 bit in place
+ vmul.s16 q8, q8, q0 // pixel * scale
+ vmul.s16 q9, q9, q0
+ vrshl.s16 q8, q8, q2 // q2 holds -denom: rounding right shift by denom
+ vrshl.s16 q9, q9, q2
+ vadd.s16 q8, q8, q1 // + offset
+ vadd.s16 q9, q9, q1
+ vqmovun.s16 d16, q8 // saturating narrow back to unsigned 8 bit
+ vqmovun.s16 d18, q9
+ vst1.8 {d16}, [r0,:64], r1 // row 0 to [r0], then r0 += r1
+ vst1.8 {d18}, [r0,:64], r1 // row 1
+ bgt weight8_loop // continue while ip > 0
+ pop {r4-r5,pc} // restore r4-r5 and return
+.endfunc
+
+function x264_mc_weight_w4_neon // 4 pixels wide, 2 rows per iteration, scale + denom shift + offset
+ weight_prologue full
+weight4_loop:
+ subs ip, #2 // two rows per iteration; flags are consumed by the bgt below
+ vld1.32 {d16[]}, [r2], r3 // row 0: one 32-bit word (4 pixels) replicated to both lanes
+ vld1.32 {d18[]}, [r2], r3 // row 1, same layout
+ vmovl.u8 q8, d16 // d16 = row 0 as 4 x 16 bit (d17 holds a duplicate)
+ vmovl.u8 q9, d18 // d18 = row 1 as 4 x 16 bit
+ vmul.s16 d16, d16, d0 // row 0 * scale
+ vmul.s16 d17, d18, d0 // row 1 * scale, written into the upper half of q8
+ vrshl.s16 q8, q8, q2 // both rows at once: rounding right shift by denom
+ vadd.s16 q8, q8, q1 // + offset
+ vqmovun.s16 d16, q8 // d16[0] = row 0 bytes, d16[1] = row 1 bytes
+ vst1.32 {d16[0]}, [r0,:32], r1 // row 0 to [r0], then r0 += r1
+ vst1.32 {d16[1]}, [r0,:32], r1 // row 1
+ bgt weight4_loop // continue while ip > 0
+ pop {r4-r5,pc} // restore r4-r5 and return
+.endfunc
+
+function x264_mc_weight_w20_nodenom_neon // 20 pixels wide, 2 rows per iteration, pixel*scale + offset without shift
+ weight_prologue nodenom
+ sub r1, #16 // the 16-byte store below already advances r0 by 16
+weight20_nodenom_loop:
+ subs ip, #2 // two rows per iteration; flags are consumed by the bgt below
+ vld1.8 {d17-d19}, [r2], r3 // row 0: 24 bytes loaded, 20 of them end up stored
+ vmovl.u8 q10, d17 // row 0 pixels 0-7 widened to 16 bit
+ vmovl.u8 q11, d18 // row 0 pixels 8-15
+ vmovl.u8 q14, d19 // row 0 pixels 16-23 (only d28 is used afterwards)
+ vld1.8 {d16-d18}, [r2], r3 // row 1
+ vmovl.u8 q12, d16 // row 1 pixels 0-7
+ vmovl.u8 q13, d17 // row 1 pixels 8-15
+ vmovl.u8 q15, d18 // row 1 pixels 16-23 (only d30 is used afterwards)
+ vmov q8, q1 // preload accumulators with the offset
+ vmov q9, q1
+ vmla.s16 q8, q10, q0 // offset + pixel * scale, row 0 pixels 0-7
+ vmla.s16 q9, q11, q0 // row 0 pixels 8-15
+ vmov q10, q1 // q10/q11 are free again, reuse them as accumulators
+ vmov q11, q1
+ vmla.s16 q10, q12, q0 // row 1 pixels 0-7
+ vmla.s16 q11, q13, q0 // row 1 pixels 8-15
+ vmov q12, q1
+ vmla.s16 d24, d28, d0 // row 0 pixels 16-19
+ vmla.s16 d25, d30, d0 // row 1 pixels 16-19
+ vqmovun.s16 d16, q8 // saturating narrow back to unsigned 8 bit
+ vqmovun.s16 d17, q9
+ vqmovun.s16 d18, q10
+ vqmovun.s16 d19, q11
+ vqmovun.s16 d20, q12 // d20[0] = row 0 tail, d20[1] = row 1 tail
+ vst1.8 {d16-d17}, [r0,:128]! // row 0 pixels 0-15, r0 += 16
+ vst1.32 {d20[0]}, [r0,:32], r1 // row 0 pixels 16-19, r0 += dst_stride - 16
+ vst1.8 {d18-d19}, [r0,:128]! // row 1 pixels 0-15
+ vst1.32 {d20[1]}, [r0,:32], r1 // row 1 pixels 16-19
+ bgt weight20_nodenom_loop // continue while ip > 0
+ pop {r4-r5,pc} // restore r4-r5 and return
+.endfunc
+
+function x264_mc_weight_w16_nodenom_neon // 16 pixels wide, 2 rows per iteration, pixel*scale + offset without shift
+ weight_prologue nodenom
+weight16_nodenom_loop:
+ subs ip, #2 // two rows per iteration; flags are consumed by the bgt below
+ vld1.8 {d16-d17}, [r2], r3 // row 0
+ vld1.8 {d18-d19}, [r2], r3 // row 1
+ vmovl.u8 q12, d16 // widen to 16 bit: row 0 pixels 0-7
+ vmovl.u8 q13, d17 // row 0 pixels 8-15
+ vmovl.u8 q14, d18 // row 1 pixels 0-7
+ vmovl.u8 q15, d19 // row 1 pixels 8-15
+ vmov q8, q1 // preload accumulators with the offset
+ vmov q9, q1
+ vmov q10, q1
+ vmov q11, q1
+ vmla.s16 q8, q12, q0 // offset + pixel * scale
+ vmla.s16 q9, q13, q0
+ vmla.s16 q10, q14, q0
+ vmla.s16 q11, q15, q0
+ vqmovun.s16 d16, q8 // saturating narrow back to unsigned 8 bit
+ vqmovun.s16 d17, q9
+ vqmovun.s16 d18, q10
+ vqmovun.s16 d19, q11
+ vst1.8 {d16-d17}, [r0,:128], r1 // row 0 to [r0], then r0 += r1
+ vst1.8 {d18-d19}, [r0,:128], r1 // row 1
+ bgt weight16_nodenom_loop // continue while ip > 0
+ pop {r4-r5,pc} // restore r4-r5 and return
+.endfunc
+
+function x264_mc_weight_w8_nodenom_neon // 8 pixels wide, 2 rows per iteration, pixel*scale + offset without shift
+ weight_prologue nodenom
+weight8_nodenom_loop:
+ subs ip, #2 // two rows per iteration; flags are consumed by the bgt below
+ vld1.8 {d16}, [r2], r3 // row 0
+ vld1.8 {d18}, [r2], r3 // row 1
+ vmovl.u8 q8, d16 // widen row 0 to 16 bit
+ vmovl.u8 q9, d18 // widen row 1 to 16 bit
+ vmov q10, q1 // preload accumulators with the offset
+ vmov q11, q1
+ vmla.s16 q10, q8, q0 // offset + pixel * scale, row 0
+ vmla.s16 q11, q9, q0 // row 1
+ vqmovun.s16 d16, q10 // saturating narrow back to unsigned 8 bit
+ vqmovun.s16 d17, q11
+ vst1.8 {d16}, [r0,:64], r1 // row 0 to [r0], then r0 += r1
+ vst1.8 {d17}, [r0,:64], r1 // row 1
+ bgt weight8_nodenom_loop // continue while ip > 0
+ pop {r4-r5,pc} // restore r4-r5 and return
+.endfunc
+
+function x264_mc_weight_w4_nodenom_neon // 4 pixels wide, 2 rows per iteration, pixel*scale + offset without shift
+ weight_prologue nodenom
+weight4_nodenom_loop:
+ subs ip, #2 // two rows per iteration; flags are consumed by the bgt below
+ vld1.32 {d16[]}, [r2], r3 // row 0: one 32-bit word (4 pixels) replicated to both lanes
+ vld1.32 {d18[]}, [r2], r3 // row 1, same layout
+ vmovl.u8 q8, d16 // d16 = row 0 as 4 x 16 bit
+ vmovl.u8 q9, d18 // d18 = row 1 as 4 x 16 bit
+ vmov q10, q1 // preload the accumulator with the offset
+ vmla.s16 d20, d16, d0 // row 0: offset + pixel * scale
+ vmla.s16 d21, d18, d0 // row 1
+ vqmovun.s16 d16, q10 // d16[0] = row 0 bytes, d16[1] = row 1 bytes
+ vst1.32 {d16[0]}, [r0,:32], r1 // row 0 to [r0], then r0 += r1
+ vst1.32 {d16[1]}, [r0,:32], r1 // row 1
+ bgt weight4_nodenom_loop // continue while ip > 0
+ pop {r4-r5,pc} // restore r4-r5 and return
+.endfunc
+
+.macro weight_simple_prologue // entry code for the offset-only kernels
+ push {lr} // 1 word pushed, so the stack arguments now start at sp+4
+ ldr lr, [sp, #4] // weight_t
+ ldr ip, [sp, #8] // h
+ ldr lr, [lr] // offset: first word of the weight struct (presumably cachea[0] - confirm layout)
+ vdup.8 q1, lr // q1 = low byte of that word in every 8-bit lane
+.endm
+
+.macro weight_simple name op // emits w20/w16/w8/w4 kernels that apply op with the offset bytes in q1
+function x264_mc_weight_w20_\name\()_neon
+ weight_simple_prologue
+weight20_\name\()_loop:
+ subs ip, #2 // two rows per iteration; flags are consumed by the bgt below
+ vld1.8 {d16-d18}, [r2], r3 // row 0, 24 bytes
+ vld1.8 {d19-d21}, [r2], r3 // row 1, 24 bytes
+ \op q8, q8, q1 // d16-d17: row 0 bytes 0-15
+ \op q9, q9, q1 // d18-d19: row 0 bytes 16-23 and row 1 bytes 0-7
+ \op q10, q10, q1 // d20-d21: row 1 bytes 8-23
+ vst1.8 {d16-d18}, [r0,:64], r1 // writes all 24 bytes of row 0, then r0 += r1
+ vst1.8 {d19-d21}, [r0,:64], r1 // writes all 24 bytes of row 1
+ bgt weight20_\name\()_loop
+ pop {pc} // return (lr was pushed by the prologue)
+.endfunc
+
+function x264_mc_weight_w16_\name\()_neon
+ weight_simple_prologue
+weight16_\name\()_loop:
+ subs ip, #2 // two rows per iteration
+ vld1.8 {d16-d17}, [r2], r3 // row 0
+ vld1.8 {d18-d19}, [r2], r3 // row 1
+ \op q8, q8, q1 // row 0 with the offset applied
+ \op q9, q9, q1 // row 1 with the offset applied
+ vst1.8 {d16-d17}, [r0,:128], r1
+ vst1.8 {d18-d19}, [r0,:128], r1
+ bgt weight16_\name\()_loop
+ pop {pc}
+.endfunc
+
+function x264_mc_weight_w8_\name\()_neon
+ weight_simple_prologue
+weight8_\name\()_loop:
+ subs ip, #2 // two rows per iteration
+ vld1.8 {d16}, [r2], r3 // row 0 into the low half of q8
+ vld1.8 {d17}, [r2], r3 // row 1 into the high half of q8
+ \op q8, q8, q1 // both rows in one operation
+ vst1.8 {d16}, [r0,:64], r1
+ vst1.8 {d17}, [r0,:64], r1
+ bgt weight8_\name\()_loop
+ pop {pc}
+.endfunc
+
+function x264_mc_weight_w4_\name\()_neon
+ weight_simple_prologue
+weight4_\name\()_loop:
+ subs ip, #2 // two rows per iteration
+ vld1.32 {d16[]}, [r2], r3 // row 0: one 32-bit word replicated to both lanes of d16
+ vld1.32 {d17[]}, [r2], r3 // row 1: likewise in d17
+ \op q8, q8, q1 // both rows in one operation
+ vst1.32 {d16[0]}, [r0,:32], r1 // only lane 0 of each register is written back
+ vst1.32 {d17[0]}, [r0,:32], r1
+ bgt weight4_\name\()_loop
+ pop {pc}
+.endfunc
+.endm
+
+weight_simple offsetadd, vqadd.u8 // x264_mc_weight_w*_offsetadd_neon: saturating unsigned add
+weight_simple offsetsub, vqsub.u8 // x264_mc_weight_w*_offsetsub_neon: saturating unsigned subtract
+
+
// void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height )
function x264_mc_copy_w4_neon
ldr ip, [sp]
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index 20cf151..0a7b734 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -43,6 +43,48 @@ void x264_pixel_avg2_w8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
void x264_pixel_avg2_w16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
void x264_pixel_avg2_w20_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+#define MC_WEIGHT(func) /* declare the four NEON weight kernels with suffix func and build their dispatch table */\
+void x264_mc_weight_w20##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
+void x264_mc_weight_w16##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
+void x264_mc_weight_w8##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
+void x264_mc_weight_w4##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
+\
+static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int ) = /* six slots: w4, w4, w8, w16, w16, w20; presumably indexed by block width - confirm against callers */\
+{\
+ x264_mc_weight_w4##func##_neon,\
+ x264_mc_weight_w4##func##_neon,\
+ x264_mc_weight_w8##func##_neon,\
+ x264_mc_weight_w16##func##_neon,\
+ x264_mc_weight_w16##func##_neon,\
+ x264_mc_weight_w20##func##_neon,\
+};
+
+MC_WEIGHT() /* empty suffix: x264_mc_wtab_neon, the scale + denom + offset kernels */
+MC_WEIGHT(_nodenom) /* x264_mc_nodenom_wtab_neon */
+MC_WEIGHT(_offsetadd) /* x264_mc_offsetadd_wtab_neon */
+MC_WEIGHT(_offsetsub) /* x264_mc_offsetsub_wtab_neon */
+
+static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w ) /* select the NEON kernel table matching w; h is not used here */
+{
+ if( w->i_scale == 1<<w->i_denom ) /* scale equals 2^denom: only the offset has an effect */
+ {
+ if( w->i_offset < 0 )
+ {
+ w->weightfn = x264_mc_offsetsub_wtab_neon;
+ w->cachea[0] = -w->i_offset; /* store the magnitude; the offsetsub kernels subtract it */
+ }
+ else
+ {
+ w->weightfn = x264_mc_offsetadd_wtab_neon;
+ w->cachea[0] = w->i_offset; /* the offsetadd kernels add it */
+ }
+ }
+ else if( !w->i_denom ) /* denom is 0: use the kernels without the shift step */
+ w->weightfn = x264_mc_nodenom_wtab_neon;
+ else /* general case: scale, rounding shift by denom, then offset */
+ w->weightfn = x264_mc_wtab_neon;
+}
+
void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
@@ -182,6 +224,11 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon;
+ pf->weight = x264_mc_wtab_neon;
+ pf->offsetadd = x264_mc_offsetadd_wtab_neon;
+ pf->offsetsub = x264_mc_offsetsub_wtab_neon;
+ pf->weight_cache = x264_weight_cache_neon;
+
// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
#ifndef SYS_MACOSX
pf->memcpy_aligned = x264_memcpy_aligned_neon;
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index 4dd65ed..d8533e5 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -110,16 +110,17 @@ SAD4_ARMV6 8
.macro SAD_FUNC w, h, name, align:vararg
function x264_pixel_sad\name\()_\w\()x\h\()_neon
+ SAD_START_\w \align
+
.if \w == 16
- .set r, \h / 2 - 1
+.rept \h / 2 - 1
+ SAD_\w \align
+.endr
.else
- .set r, \h - 1
-.endif
-
- SAD_START_\w \align
-.rept r
+.rept \h - 1
SAD_\w \align
.endr
+.endif
.if \w > 8
vabal.u8 q8, d4, d6
diff --git a/common/common.c b/common/common.c
index 6d1d7f0..0dd7af5 100644
--- a/common/common.c
+++ b/common/common.c
@@ -515,7 +515,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
OPT("psy-rd")
{
if( 2 == sscanf( value, "%f:%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
- 2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) )
+ 2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ||
+ 2 == sscanf( value, "%f|%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ))
{ }
else if( sscanf( value, "%f", &p->analyse.f_psy_rd ) )
{
@@ -886,7 +887,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
s += sprintf( s, " psy=%d", p->analyse.b_psy );
if( p->analyse.b_psy )
- s += sprintf( s, " psy_rd=%.1f:%.1f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
+ s += sprintf( s, " psy_rd=%.2f:%.2f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
s += sprintf( s, " chroma_me=%d", p->analyse.b_chroma_me );
diff --git a/common/common.h b/common/common.h
index 950f48f..e2e8fac 100644
--- a/common/common.h
+++ b/common/common.h
@@ -484,6 +484,7 @@ struct x264_t
int b_chroma_me;
int b_trellis;
int b_noise_reduction;
+ int b_dct_decimate;
int i_psy_rd; /* Psy RD strength--fixed point value*/
int i_psy_trellis; /* Psy trellis strength--fixed point value*/
@@ -653,11 +654,12 @@ struct x264_t
int i_chroma_lambda2_offset;
/* B_direct and weighted prediction */
- int16_t dist_scale_factor[16][2];
+ int16_t dist_scale_factor_buf[2][16][2];
+ int16_t (*dist_scale_factor)[2];
int8_t bipred_weight_buf[2][32][4];
int8_t (*bipred_weight)[4];
/* maps fref1[0]'s ref indices into the current list0 */
-#define map_col_to_list0(col) h->mb.map_col_to_list0[col+2]
+#define map_col_to_list0(col) h->mb.map_col_to_list0[(col)+2]
int8_t map_col_to_list0[18];
int ref_blind_dupe; /* The index of the blind reference frame duplicate. */
} mb;
diff --git a/common/frame.c b/common/frame.c
index 40cc78f..d89f5ab 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -472,12 +472,14 @@ static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int a
int delta;
if( abs( p2 - p0 ) < beta )
{
- pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
+ if( tc0[i] )
+ pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0[i], tc0[i] );
tc++;
}
if( abs( q2 - q0 ) < beta )
{
- pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
+ if( tc0[i] )
+ pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0[i], tc0[i] );
tc++;
}
diff --git a/common/frame.h b/common/frame.h
index b1852b3..7c8e2ff 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -48,6 +48,7 @@ typedef struct x264_frame
uint8_t i_bframes; /* number of bframes following this nonb in coded order */
float f_qp_avg_rc; /* QPs as decided by ratecontrol */
float f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */
+ int i_poc_l0ref0; /* poc of first refframe in L0, used to check if direct temporal is possible */
/* YUV buffer */
int i_plane;
diff --git a/common/macroblock.c b/common/macroblock.c
index 10f09ac..278659c 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -190,7 +190,8 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
const int x8 = i8%2;
const int y8 = i8/2;
const int i_part_8x8 = i_mb_8x8 + x8 + y8 * h->mb.i_b8_stride;
- const int i_ref = map_col_to_list0(h->fref1[0]->ref[0][i_part_8x8]);
+ const int i_ref1_ref = h->fref1[0]->ref[0][i_part_8x8];
+ const int i_ref = (map_col_to_list0(i_ref1_ref>>h->sh.b_mbaff) << h->sh.b_mbaff) + (i_ref1_ref&h->sh.b_mbaff);
if( i_ref >= 0 )
{
@@ -271,6 +272,9 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, mv[0] );
x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, mv[1] );
+ if( !M64( mv ) )
+ return 1;
+
if( h->param.i_threads > 1
&& ( mv[0][1] > h->mb.mv_max_spel[1]
|| mv[1][1] > h->mb.mv_max_spel[1] ) )
@@ -1238,6 +1242,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
if( h->sh.i_type == SLICE_TYPE_B )
{
h->mb.bipred_weight = h->mb.bipred_weight_buf[h->mb.b_interlaced&(i_mb_y&1)];
+ h->mb.dist_scale_factor = h->mb.dist_scale_factor_buf[h->mb.b_interlaced&(i_mb_y&1)];
if( h->param.b_cabac )
{
uint8_t skipbp;
@@ -1477,9 +1482,7 @@ void x264_macroblock_bipred_init( x264_t *h )
dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
}
- // FIXME: will need this if we ever do temporal MV pred with interlaced
- if( !h->sh.b_mbaff )
- h->mb.dist_scale_factor[i_ref0][i_ref1] = dist_scale_factor;
+ h->mb.dist_scale_factor_buf[field][i_ref0][i_ref1] = dist_scale_factor;
dist_scale_factor >>= 2;
if( h->param.analyse.b_weighted_bipred
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index 29e05f1..62e281a 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -59,7 +59,7 @@ endstruc
%macro LOAD_GLOBAL 4
%ifdef PIC
; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
- lea r11, [%2 GLOBAL]
+ lea r11, [%2]
%ifnidn %3, 0
add r11, %3
%endif
diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm
index a713dd6..3350e40 100644
--- a/common/x86/dct-32.asm
+++ b/common/x86/dct-32.asm
@@ -349,7 +349,7 @@ cglobal x264_sub8x8_dct_%1, 3,3
global x264_sub8x8_dct_%1.skip_prologue
.skip_prologue:
%ifnidn %1, sse2
- mova m7, [hsub_mul GLOBAL]
+ mova m7, [hsub_mul]
%endif
LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE
SPILL r0, 1,2
@@ -393,7 +393,7 @@ global x264_sub8x8_dct8_%1.skip_prologue
LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
UNSPILL r0, 0
%else
- mova m7, [hsub_mul GLOBAL]
+ mova m7, [hsub_mul]
LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
SPILL r0, 0,1
SWAP 1, 7
@@ -441,9 +441,9 @@ global x264_add8x8_idct_sse2.skip_prologue
SPILL r1, 0
TRANSPOSE2x4x4W 4,5,6,7,0
UNSPILL r1, 0
- paddw m0, [pw_32 GLOBAL]
+ paddw m0, [pw_32]
IDCT4_1D 0,1,2,3,r1
- paddw m4, [pw_32 GLOBAL]
+ paddw m4, [pw_32]
IDCT4_1D 4,5,6,7,r1
SPILL r1, 6,7
pxor m7, m7
@@ -466,7 +466,7 @@ global x264_add8x8_idct8_sse2.skip_prologue
IDCT8_1D 0,1,2,3,4,5,6,7,r1
SPILL r1, 6
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1
- paddw m0, [pw_32 GLOBAL]
+ paddw m0, [pw_32]
SPILL r1, 0
IDCT8_1D 0,1,2,3,4,5,6,7,r1
SPILL r1, 6,7
diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm
index 9915789..ba7741e 100644
--- a/common/x86/dct-64.asm
+++ b/common/x86/dct-64.asm
@@ -143,7 +143,7 @@ INIT_XMM
cglobal x264_sub8x8_dct_%1, 3,3,11
add r2, 4*FDEC_STRIDE
%ifnidn %1, sse2
- mova m7, [hsub_mul GLOBAL]
+ mova m7, [hsub_mul]
%endif
%ifdef WIN64
call .skip_prologue
@@ -170,7 +170,7 @@ global x264_sub8x8_dct_%1.skip_prologue
cglobal x264_sub8x8_dct8_%1, 3,3,11
add r2, 4*FDEC_STRIDE
%ifnidn %1, sse2
- mova m7, [hsub_mul GLOBAL]
+ mova m7, [hsub_mul]
%endif
%ifdef WIN64
call .skip_prologue
@@ -227,7 +227,7 @@ global x264_add8x8_idct8_sse2.skip_prologue
movdqa m7, [r1+0x70]
IDCT8_1D 0,1,2,3,4,5,6,7,8,10
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
- paddw m0, [pw_32 GLOBAL] ; rounding for the >>6 at the end
+ paddw m0, [pw_32] ; rounding for the >>6 at the end
IDCT8_1D 0,1,2,3,4,5,6,7,8,10
DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
@@ -265,9 +265,9 @@ global x264_add8x8_idct_sse2.skip_prologue
TRANSPOSE2x4x4W 0,1,2,3,8
IDCT4_1D 4,5,6,7,8,10
TRANSPOSE2x4x4W 4,5,6,7,8
- paddw m0, [pw_32 GLOBAL]
+ paddw m0, [pw_32]
IDCT4_1D 0,1,2,3,8,10
- paddw m4, [pw_32 GLOBAL]
+ paddw m4, [pw_32]
IDCT4_1D 4,5,6,7,8,10
DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index d4a0cae..618433c 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -80,7 +80,7 @@ cglobal x264_dct4x4dc_mmx, 1,1
movq m2, [r0+16]
movq m1, [r0+ 8]
movq m0, [r0+ 0]
- movq m7, [pw_8000 GLOBAL] ; convert to unsigned and back, so that pavgw works
+ movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
WALSH4_1D 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
SUMSUB_BADC m1, m0, m3, m2, m4
@@ -123,7 +123,7 @@ cglobal x264_sub4x4_dct_%1, 3,3
LOAD_DIFF m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
LOAD_DIFF m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
%else
- mova m5, [hsub_mul GLOBAL]
+ mova m5, [hsub_mul]
LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2
%endif
DCT4_1D 0,1,2,3,4
@@ -151,7 +151,7 @@ cglobal x264_add4x4_idct_mmx, 2,2
movq m0, [r1+ 0]
IDCT4_1D 0,1,2,3,4,5
TRANSPOSE4x4W 0,1,2,3,4
- paddw m0, [pw_32 GLOBAL]
+ paddw m0, [pw_32]
IDCT4_1D 0,1,2,3,4,5
STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
@@ -179,7 +179,7 @@ cglobal x264_add4x4_idct_sse4, 2,2,6
punpckhdq m2, m0
SWAP 0, 1
- mova m1, [pw_32_0 GLOBAL]
+ mova m1, [pw_32_0]
paddw m1, m0 ; row1/row0 corrected
psraw m0, 1 ; row1>>1/...
mova m3, m2 ; row3/row2
@@ -221,7 +221,7 @@ cglobal %1, 3,3,11
pxor m7, m7
%else
add r2, 4*FDEC_STRIDE
- mova m7, [hsub_mul GLOBAL]
+ mova m7, [hsub_mul]
%endif
.skip_prologue:
%ifdef WIN64
@@ -335,7 +335,7 @@ cglobal x264_add8x8_idct_dc_mmx, 2,2
movq mm0, [r1]
pxor mm1, mm1
add r0, FDEC_STRIDE*4
- paddw mm0, [pw_32 GLOBAL]
+ paddw mm0, [pw_32]
psraw mm0, 6
psubw mm1, mm0
packuswb mm0, mm0
@@ -354,10 +354,10 @@ cglobal x264_add8x8_idct_dc_ssse3, 2,2
movq xmm0, [r1]
pxor xmm1, xmm1
add r0, FDEC_STRIDE*4
- paddw xmm0, [pw_32 GLOBAL]
+ paddw xmm0, [pw_32]
psraw xmm0, 6
psubw xmm1, xmm0
- movdqa xmm5, [pb_idctdc_unpack GLOBAL]
+ movdqa xmm5, [pb_idctdc_unpack]
packuswb xmm0, xmm0
packuswb xmm1, xmm1
pshufb xmm0, xmm5
@@ -393,7 +393,7 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3
.loop:
movq mm0, [r1]
pxor mm1, mm1
- paddw mm0, [pw_32 GLOBAL]
+ paddw mm0, [pw_32]
psraw mm0, 6
psubw mm1, mm0
packuswb mm0, mm0
@@ -447,8 +447,8 @@ cglobal x264_add16x16_idct_dc_sse2, 2,2,8
punpcklwd xmm2, xmm2
pxor xmm1, xmm1
pxor xmm3, xmm3
- paddw xmm0, [pw_32 GLOBAL]
- paddw xmm2, [pw_32 GLOBAL]
+ paddw xmm0, [pw_32]
+ paddw xmm2, [pw_32]
psraw xmm0, 6
psraw xmm2, 6
psubw xmm1, xmm0
@@ -477,11 +477,11 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
movdqa xmm0, [r1]
add r1, 16
pxor xmm1, xmm1
- paddw xmm0, [pw_32 GLOBAL]
+ paddw xmm0, [pw_32]
psraw xmm0, 6
psubw xmm1, xmm0
- movdqa xmm5, [ pb_idctdc_unpack GLOBAL]
- movdqa xmm6, [pb_idctdc_unpack2 GLOBAL]
+ movdqa xmm5, [ pb_idctdc_unpack]
+ movdqa xmm6, [pb_idctdc_unpack2]
packuswb xmm0, xmm0
packuswb xmm1, xmm1
movdqa xmm2, xmm0
@@ -815,8 +815,8 @@ cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
movdqa xmm1, [r1+16]
movdqa xmm0, [r1]
- pshufb xmm1, [pb_scan4frameb GLOBAL]
- pshufb xmm0, [pb_scan4framea GLOBAL]
+ pshufb xmm1, [pb_scan4frameb]
+ pshufb xmm0, [pb_scan4framea]
movdqa xmm2, xmm1
psrldq xmm1, 6
palignr xmm2, xmm0, 6
@@ -963,9 +963,9 @@ cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
punpcklqdq xmm0, xmm2
punpcklqdq xmm4, xmm6
%ifidn %2, frame
- movdqa xmm7, [pb_sub4frame GLOBAL]
+ movdqa xmm7, [pb_sub4frame]
%else
- movdqa xmm7, [pb_sub4field GLOBAL]
+ movdqa xmm7, [pb_sub4field]
%endif
pshufb xmm0, xmm7
pshufb xmm4, xmm7
@@ -980,7 +980,7 @@ cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
psubw xmm1, xmm5
%ifidn %1, ac
movd r2d, xmm0
- pand xmm0, [pb_subacmask GLOBAL]
+ pand xmm0, [pb_subacmask]
%endif
movdqa [r0], xmm0
pxor xmm2, xmm2
@@ -1039,7 +1039,7 @@ cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
packsswb m5, m5
pxor m0, m0
pcmpeqb m5, m0
- paddb m5, [pb_1 GLOBAL]
+ paddb m5, [pb_1]
movd r0d, m5
mov [r2+0], r0w
shr r0d, 16
@@ -1085,7 +1085,7 @@ cglobal x264_zigzag_interleave_8x8_cavlc_sse2, 3,3,8
packsswb m2, m2
packsswb m2, m2
pcmpeqb m5, m2
- paddb m5, [pb_1 GLOBAL]
+ paddb m5, [pb_1]
movd r0d, m5
mov [r2+0], r0w
shr r0d, 16
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index 75b308f..00d0418 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -233,19 +233,19 @@ SECTION .text
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
mova m5, m1
- pxor m5, m2 ; p0^q0
- pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
+ pxor m5, m2 ; p0^q0
+ pand m5, [pb_01] ; (p0^q0)&1
pcmpeqb m4, m4
pxor m3, m4
- pavgb m3, m0 ; (p1 - q1 + 256)>>1
- pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
+ pavgb m3, m0 ; (p1 - q1 + 256)>>1
+ pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
pxor m4, m1
- pavgb m4, m2 ; (q0 - p0 + 256)>>1
+ pavgb m4, m2 ; (q0 - p0 + 256)>>1
pavgb m3, m5
- paddusb m3, m4 ; d+128+33
- mova m6, [pb_a1 GLOBAL]
+ paddusb m3, m4 ; d+128+33
+ mova m6, [pb_a1]
psubusb m6, m3
- psubusb m3, [pb_a1 GLOBAL]
+ psubusb m3, [pb_a1]
pminub m6, m7
pminub m3, m7
psubusb m1, m6
@@ -261,10 +261,10 @@ SECTION .text
%macro LUMA_Q1 6
mova %6, m1
pavgb %6, m2
- pavgb %2, %6 ; avg(p2,avg(p0,q0))
+ pavgb %2, %6 ; avg(p2,avg(p0,q0))
pxor %6, %3
- pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
- psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
+ pand %6, [pb_01] ; (p2^avg(p0,q0))&1
+ psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
mova %6, %1
psubusb %6, %5
paddusb %5, %1
@@ -614,8 +614,8 @@ DEBLOCK_LUMA sse2, v, 16
%define mask0 spill(2)
%define mask1p spill(3)
%define mask1q spill(4)
- %define mpb_00 [pb_00 GLOBAL]
- %define mpb_01 [pb_01 GLOBAL]
+ %define mpb_00 [pb_00]
+ %define mpb_01 [pb_01]
%endif
;-----------------------------------------------------------------------------
@@ -639,7 +639,7 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
mova q1, [r0+r1]
%ifdef ARCH_X86_64
pxor mpb_00, mpb_00
- mova mpb_01, [pb_01 GLOBAL]
+ mova mpb_01, [pb_01]
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
SWAP 7, 12 ; m12=mask0
pavgb t5, mpb_00
@@ -658,8 +658,8 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
mova m4, t5
mova mask0, m7
- pavgb m4, [pb_00 GLOBAL]
- pavgb m4, [pb_01 GLOBAL] ; alpha/4+1
+ pavgb m4, [pb_00]
+ pavgb m4, [pb_01] ; alpha/4+1
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
pand m6, mask0
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
@@ -835,7 +835,7 @@ chroma_inter_body_mmxext:
%macro CHROMA_INTRA_P0 3
movq m4, %1
pxor m4, %3
- pand m4, [pb_01 GLOBAL] ; m4 = (p0^q1)&1
+ pand m4, [pb_01] ; m4 = (p0^q1)&1
pavgb %1, %3
psubusb %1, m4
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index f486a8d..9783066 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -89,9 +89,9 @@ SECTION .text
%macro BIWEIGHT_START_MMX 0
movd m2, r6m
SPLATW m2, m2 ; weight_dst
- mova m3, [pw_64 GLOBAL]
+ mova m3, [pw_64]
psubw m3, m2 ; weight_src
- mova m4, [pw_32 GLOBAL] ; rounding
+ mova m4, [pw_32] ; rounding
pxor m5, m5
%endmacro
@@ -111,7 +111,7 @@ SECTION .text
shl t7d, 8
add t6d, t7d
movd m3, t6d
- mova m4, [pw_32 GLOBAL]
+ mova m4, [pw_32]
SPLATW m3, m3 ; weight_dst,src
%endmacro
@@ -641,7 +641,7 @@ AVG2_W20 sse2_misalign
%macro INIT_SHIFT 2
and eax, 7
shl eax, 3
- movd %1, [sw_64 GLOBAL]
+ movd %1, [sw_64]
movd %2, eax
psubw %1, %2
%endmacro
@@ -778,10 +778,10 @@ cglobal x264_pixel_avg2_w16_cache64_ssse3
shl r6, 4 ;jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
%ifdef PIC
- lea r11, [avg_w16_addr GLOBAL]
+ lea r11, [avg_w16_addr]
add r6, r11
%else
- lea r6, [avg_w16_addr + r6 GLOBAL]
+ lea r6, [avg_w16_addr + r6]
%endif
%ifdef UNIX64
jmp r6
@@ -1007,7 +1007,7 @@ cglobal x264_mc_chroma_%1
SPLATW m5, m5 ; m5 = dx
SPLATW m6, m6 ; m6 = dy
- mova m4, [pw_8 GLOBAL]
+ mova m4, [pw_8]
mova m0, m4
psubw m4, m5 ; m4 = 8-dx
psubw m0, m6 ; m0 = 8-dy
@@ -1042,7 +1042,7 @@ cglobal x264_mc_chroma_%1
punpcklbw m2, m3
punpcklbw m1, m3
- paddw m0, [pw_32 GLOBAL]
+ paddw m0, [pw_32]
pmullw m2, m5 ; line * cB
pmullw m1, m7 ; line * cD
@@ -1084,9 +1084,9 @@ cglobal x264_mc_chroma_%1
movd m6, r4d
mov r5d, 1
.mc1d:
- mova m5, [pw_8 GLOBAL]
+ mova m5, [pw_8]
SPLATW m6, m6
- mova m7, [pw_4 GLOBAL]
+ mova m7, [pw_4]
psubw m5, m6
movifnidn r0, r0mp
movifnidn r1d, r1m
@@ -1166,7 +1166,7 @@ cglobal x264_mc_chroma_ssse3%1, 0,6,%2
imul r4d, t0d ; (x*255+8)*(8-y)
cmp dword r6m, 4
jg .width8
- mova m5, [pw_32 GLOBAL]
+ mova m5, [pw_32]
movd m6, r5d
movd m7, r4d
movifnidn r0, r0mp
@@ -1178,10 +1178,10 @@ cglobal x264_mc_chroma_ssse3%1, 0,6,%2
and r2, ~3
and r5, 3
%ifdef PIC
- lea r11, [ch_shuffle GLOBAL]
+ lea r11, [ch_shuffle]
movu m5, [r11 + r5*2]
%else
- movu m5, [ch_shuffle + r5*2 GLOBAL]
+ movu m5, [ch_shuffle + r5*2]
%endif
movu m0, [r2]
pshufb m0, m5
@@ -1197,8 +1197,8 @@ cglobal x264_mc_chroma_ssse3%1, 0,6,%2
pmaddubsw m1, m6
pmaddubsw m2, m7
pmaddubsw m3, m6
- paddw m0, [pw_32 GLOBAL]
- paddw m2, [pw_32 GLOBAL]
+ paddw m0, [pw_32]
+ paddw m2, [pw_32]
paddw m1, m0
paddw m3, m2
mova m0, m4
@@ -1228,7 +1228,7 @@ INIT_XMM
cmp r5, 0x38
jge .split
%endif
- mova m5, [pw_32 GLOBAL]
+ mova m5, [pw_32]
movh m0, [r2]
movh m1, [r2+1]
punpcklbw m0, m1
@@ -1265,18 +1265,18 @@ INIT_XMM
and r2, ~7
and r5, 7
%ifdef PIC
- lea r11, [ch_shuffle GLOBAL]
+ lea r11, [ch_shuffle]
movu m5, [r11 + r5*2]
%else
- movu m5, [ch_shuffle + r5*2 GLOBAL]
+ movu m5, [ch_shuffle + r5*2]
%endif
movu m0, [r2]
pshufb m0, m5
%ifdef ARCH_X86_64
- mova m8, [pw_32 GLOBAL]
+ mova m8, [pw_32]
%define round m8
%else
- %define round [pw_32 GLOBAL]
+ %define round [pw_32]
%endif
.splitloop8:
movu m1, [r2+r3]
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 245c09f..f2e69c0 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -125,7 +125,7 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2
%ifnidn %1, ssse3
pxor m0, m0
%else
- mova m0, [filt_mul51 GLOBAL]
+ mova m0, [filt_mul51]
%endif
.loop:
%ifidn %1, ssse3
@@ -142,8 +142,8 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2
pmaddubsw m4, m0
pmaddubsw m2, m0
pmaddubsw m5, m0
- pmaddubsw m3, [filt_mul20 GLOBAL]
- pmaddubsw m6, [filt_mul20 GLOBAL]
+ pmaddubsw m3, [filt_mul20]
+ pmaddubsw m6, [filt_mul20]
paddw m1, m2
paddw m4, m5
paddw m1, m3
@@ -155,7 +155,7 @@ cglobal x264_hpel_filter_v_%1, 5,6,%2
LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
FILT_V2
%endif
- mova m7, [pw_16 GLOBAL]
+ mova m7, [pw_16]
mova [r2+r4*2], m1
mova [r2+r4*2+mmsize], m4
paddw m1, m7
@@ -180,7 +180,7 @@ cglobal x264_hpel_filter_c_mmxext, 3,3
lea r1, [r1+r2*2]
neg r2
%define src r1+r2*2
- movq m7, [pw_32 GLOBAL]
+ movq m7, [pw_32]
.loop:
movq m1, [src-4]
movq m2, [src-2]
@@ -237,7 +237,7 @@ cglobal x264_hpel_filter_h_mmxext, 3,3
punpcklbw m7, m0
punpcklbw m6, m0
paddw m6, m7 ; a1
- movq m7, [pw_1 GLOBAL]
+ movq m7, [pw_1]
FILT_H2 m1, m2, m3, m4, m5, m6
FILT_PACK m1, m4, 1
movntq [r0+r2], m1
@@ -257,13 +257,13 @@ cglobal x264_hpel_filter_c_%1, 3,3,9
neg r2
%define src r1+r2*2
%ifidn %1, ssse3
- mova m7, [pw_32 GLOBAL]
+ mova m7, [pw_32]
%define tpw_32 m7
%elifdef ARCH_X86_64
- mova m8, [pw_32 GLOBAL]
+ mova m8, [pw_32]
%define tpw_32 m8
%else
- %define tpw_32 [pw_32 GLOBAL]
+ %define tpw_32 [pw_32]
%endif
.loop:
%ifidn %1,sse2_misalign
@@ -340,7 +340,7 @@ cglobal x264_hpel_filter_h_sse2, 3,3,8
punpcklbw m6, m0
punpcklbw m7, m0
paddw m6, m7 ; c1
- mova m7, [pw_1 GLOBAL] ; FIXME xmm8
+ mova m7, [pw_1] ; FIXME xmm8
FILT_H2 m1, m2, m3, m4, m5, m6
FILT_PACK m1, m4, 1
movntdq [r0+r2], m1
@@ -362,7 +362,7 @@ cglobal x264_hpel_filter_h_ssse3, 3,3
punpcklbw m1, m0 ; 00 -1 00 -2 00 -3 00 -4 00 -5 00 -6 00 -7 00 -8
movh m2, [src]
punpcklbw m2, m0
- mova m7, [pw_1 GLOBAL]
+ mova m7, [pw_1]
.loop:
movh m3, [src+8]
punpcklbw m3, m0
@@ -436,7 +436,7 @@ HPEL_V ssse3
mova m3, [r1]
mova %4, [r1+r2]
mova m0, [r1+r2*2]
- mova %2, [filt_mul51 GLOBAL]
+ mova %2, [filt_mul51]
mova m4, m1
punpcklbw m1, m2
punpckhbw m4, m2
@@ -452,8 +452,8 @@ HPEL_V ssse3
pmaddubsw m4, %2
pmaddubsw m0, %2
pmaddubsw m2, %2
- pmaddubsw m3, [filt_mul20 GLOBAL]
- pmaddubsw %1, [filt_mul20 GLOBAL]
+ pmaddubsw m3, [filt_mul20]
+ pmaddubsw %1, [filt_mul20]
psrlw %3, 8
psrlw %4, 8
paddw m1, m0
@@ -1096,7 +1096,7 @@ cglobal x264_mbtree_propagate_cost_sse2, 6,6
add r4, r5
neg r5
pxor xmm5, xmm5
- movdqa xmm4, [pd_128 GLOBAL]
+ movdqa xmm4, [pd_128]
.loop:
movq xmm2, [r2+r5] ; intra
movq xmm0, [r4+r5] ; invq
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index d94daaf..46b4557 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -59,7 +59,7 @@ SECTION .text
%endmacro
%macro HADDW 2
- pmaddwd %1, [pw_1 GLOBAL]
+ pmaddwd %1, [pw_1]
HADDD %1, %2
%endmacro
@@ -244,9 +244,9 @@ cglobal x264_pixel_ssd_%1x%2_%3, 0,0,0
%endif
%ifidn %3, ssse3
- mova m7, [hsub_mul GLOBAL]
+ mova m7, [hsub_mul]
%elifidn %3, sse2
- mova m7, [pw_00ff GLOBAL]
+ mova m7, [pw_00ff]
%elif %1 >= mmsize
pxor m7, m7
%endif
@@ -310,7 +310,7 @@ SSD 4, 8, ssse3
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
%if %1
- mova m7, [pw_00ff GLOBAL]
+ mova m7, [pw_00ff]
%else
pxor m7, m7 ; zero
%endif
@@ -482,7 +482,7 @@ cglobal x264_pixel_var2_8x8_sse2, 5,6,8
cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
- mova m7, [hsub_mul GLOBAL]
+ mova m7, [hsub_mul]
mov r5d, 2
.loop:
movq m0, [r0]
@@ -775,7 +775,7 @@ cglobal x264_pixel_satd_4x4_mmxext, 4,6
%macro SATD_START_SSE2 3
%ifnidn %1, sse2
- mova %3, [hmul_8p GLOBAL]
+ mova %3, [hmul_8p]
%endif
lea r4, [3*r1]
lea r5, [3*r3]
@@ -815,7 +815,7 @@ INIT_XMM
%ifnidn %1, sse2
cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
SATD_START_MMX
- mova m4, [hmul_4p GLOBAL]
+ mova m4, [hmul_4p]
LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
@@ -832,7 +832,7 @@ cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
cglobal x264_pixel_satd_4x8_%1, 4, 6, 8
SATD_START_MMX
%ifnidn %1, sse2
- mova m7, [hmul_4p GLOBAL]
+ mova m7, [hmul_4p]
%endif
movd m4, [r2]
movd m5, [r2+r3]
@@ -889,14 +889,14 @@ cglobal x264_pixel_satd_16x4_internal_%1
cglobal x264_pixel_satd_16x8_%1, 4,6,12
SATD_START_SSE2 %1, m10, m7
%ifidn %1, sse2
- mova m7, [pw_00ff GLOBAL]
+ mova m7, [pw_00ff]
%endif
jmp x264_pixel_satd_16x8_internal_%1
cglobal x264_pixel_satd_16x16_%1, 4,6,12
SATD_START_SSE2 %1, m10, m7
%ifidn %1, sse2
- mova m7, [pw_00ff GLOBAL]
+ mova m7, [pw_00ff]
%endif
call x264_pixel_satd_16x4_internal_%1
call x264_pixel_satd_16x4_internal_%1
@@ -977,7 +977,7 @@ cglobal x264_pixel_sa8d_8x8_%1, 4,6,12
lea r4, [3*r1]
lea r5, [3*r3]
%ifnidn %1, sse2
- mova m7, [hmul_8p GLOBAL]
+ mova m7, [hmul_8p]
%endif
call x264_pixel_sa8d_8x8_internal_%1
HADDW m0, m1
@@ -990,7 +990,7 @@ cglobal x264_pixel_sa8d_16x16_%1, 4,6,12
lea r4, [3*r1]
lea r5, [3*r3]
%ifnidn %1, sse2
- mova m7, [hmul_8p GLOBAL]
+ mova m7, [hmul_8p]
%endif
call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
add r2, 8
@@ -1029,7 +1029,7 @@ cglobal x264_pixel_sa8d_8x8_internal_%1
paddw m0, m1
HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
%else ; non-sse2
- mova m7, [hmul_8p GLOBAL]
+ mova m7, [hmul_8p]
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
; could do first HADAMARD4_V here to save spilling later
; surprisingly, not a win on conroe or even p4
@@ -1221,7 +1221,7 @@ cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16
paddusw m2, m0
; 3x HADDW
- movdqa m7, [pw_1 GLOBAL]
+ movdqa m7, [pw_1]
pmaddwd m2, m7
pmaddwd m14, m7
pmaddwd m15, m7
@@ -1650,7 +1650,7 @@ cglobal x264_hadamard_ac_2x2max_mmxext
ret
cglobal x264_hadamard_ac_8x8_mmxext
- mova m6, [mask_ac4 GLOBAL]
+ mova m6, [mask_ac4]
pxor m7, m7
call x264_hadamard_ac_4x4_mmxext
add r0, 4
@@ -1727,7 +1727,7 @@ cglobal x264_pixel_hadamard_ac_%1x%2_mmxext, 2,4
mova m3, m0
paddusw m1, [rsp+0x38]
pxor m3, m2
- pand m3, [pw_1 GLOBAL]
+ pand m3, [pw_1]
pavgw m0, m2
psubusw m0, m3
HADDUW m0, m2
@@ -1791,7 +1791,7 @@ cglobal x264_hadamard_ac_8x8_%1
%endif
%ifnidn %1, sse2
;LOAD_INC loads sumsubs
- mova m7, [hmul_8p GLOBAL]
+ mova m7, [hmul_8p]
%else
;LOAD_INC only unpacks to words
pxor m7, m7
@@ -1834,9 +1834,9 @@ cglobal x264_hadamard_ac_8x8_%1
paddw m1, m2
SUMSUB_BA m0, m4; m2
%ifnidn %1, sse2
- pand m1, [mask_ac4b GLOBAL]
+ pand m1, [mask_ac4b]
%else
- pand m1, [mask_ac4 GLOBAL]
+ pand m1, [mask_ac4]
%endif
ABS_MOV m2, spill0
paddw m1, m3
@@ -1878,7 +1878,7 @@ cglobal x264_hadamard_ac_8x8_%1
paddw m2, m1
paddw m2, m2
ABS1 m4, m7
- pand m0, [mask_ac8 GLOBAL]
+ pand m0, [mask_ac8]
ABS1 m0, m7
paddw m2, m4
paddw m0, m2
@@ -2041,7 +2041,7 @@ cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
SSIM_ITER 3
; PHADDW m1, m2
; PHADDD m3, m4
- movdqa m7, [pw_1 GLOBAL]
+ movdqa m7, [pw_1]
pshufd m5, m3, 0xb1
pmaddwd m1, m7
pmaddwd m2, m7
@@ -2086,8 +2086,8 @@ cglobal x264_pixel_ssim_end4_sse2, 3,3,7
paddd m1, m2
paddd m2, m3
paddd m3, m4
- movdqa m5, [ssim_c1 GLOBAL]
- movdqa m6, [ssim_c2 GLOBAL]
+ movdqa m5, [ssim_c1]
+ movdqa m6, [ssim_c2]
TRANSPOSE4x4D 0, 1, 2, 3, 4
; s1=m0, s2=m1, ss=m2, s12=m3
@@ -2117,10 +2117,10 @@ cglobal x264_pixel_ssim_end4_sse2, 3,3,7
je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
neg r2
%ifdef PIC
- lea r3, [mask_ff + 16 GLOBAL]
+ lea r3, [mask_ff + 16]
movdqu m1, [r3 + r2*4]
%else
- movdqu m1, [mask_ff + r2*4 + 16 GLOBAL]
+ movdqu m1, [mask_ff + r2*4 + 16]
%endif
pand m4, m1
.skip:
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 808aa31..4d03f8f 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -99,7 +99,7 @@ SECTION .text
pavgb %2, %3
pxor %3, %5
mov%6 %1, %4
- pand %3, [pb_1 GLOBAL]
+ pand %3, [pb_1]
psubusb %2, %3
pavgb %1, %2
%endmacro
@@ -466,7 +466,7 @@ cglobal predict_8x8_dc_mmxext, 2,2
pxor mm1, mm1
psadbw mm0, [r1+7]
psadbw mm1, [r1+16]
- paddw mm0, [pw_8 GLOBAL]
+ paddw mm0, [pw_8]
paddw mm0, mm1
psrlw mm0, 4
pshufw mm0, mm0, 0
@@ -481,7 +481,7 @@ cglobal predict_8x8_dc_mmxext, 2,2
cglobal %1, 2,2
pxor mm0, mm0
psadbw mm0, [r1+%2]
- paddw mm0, [pw_4 GLOBAL]
+ paddw mm0, [pw_4]
psrlw mm0, 3
pshufw mm0, mm0, 0
packuswb mm0, mm0
@@ -643,7 +643,7 @@ cglobal predict_8x8_vr_core_mmxext, 2,2
cglobal predict_8x8c_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
movq mm1, mm2
- pmullw mm2, [pw_3210 GLOBAL]
+ pmullw mm2, [pw_3210]
psllw mm1, 2
paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
@@ -672,7 +672,7 @@ cglobal predict_16x16_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
movq mm5, mm2
movq mm1, mm2
- pmullw mm5, [pw_3210 GLOBAL]
+ pmullw mm5, [pw_3210]
psllw mm2, 3
psllw mm1, 2
movq mm3, mm2
@@ -786,7 +786,7 @@ cglobal predict_8x8_vl_sse2, 2,2
;-----------------------------------------------------------------------------
cglobal predict_8x8_vr_sse2, 2,2,7
movdqu xmm0, [r1+8]
- movdqa xmm6, [pw_ff00 GLOBAL]
+ movdqa xmm6, [pw_ff00]
add r0, 4*FDEC_STRIDE
movdqa xmm1, xmm0
movdqa xmm2, xmm0
@@ -910,7 +910,7 @@ cglobal predict_8x8_hu_%1, 2,2
add r0, 4*FDEC_STRIDE
%ifidn %1, ssse3
movq mm5, [r1+7]
- movq mm6, [pb_reverse GLOBAL]
+ movq mm6, [pb_reverse]
movq mm1, mm5
movq mm2, mm5
movq mm3, mm5
@@ -979,7 +979,7 @@ cglobal predict_8x8c_v_mmx, 1,1
%macro PRED_8x8C_H 1
cglobal predict_8x8c_h_%1, 1,1
%ifidn %1, ssse3
- mova m1, [pb_3 GLOBAL]
+ mova m1, [pb_3]
%endif
%assign n 0
%rep 8
@@ -1018,7 +1018,7 @@ cglobal predict_8x8c_dc_core_mmxext, 1,1
pshufw mm2, r2m, 0
%endif
psrlw mm0, 3
- paddw mm1, [pw_2 GLOBAL]
+ paddw mm1, [pw_2]
movq mm3, mm2
pshufw mm1, mm1, 0
pshufw mm0, mm0, 0 ; dc0 (w)
@@ -1065,7 +1065,7 @@ cglobal predict_8x8c_p_core_sse2, 1,1
punpcklqdq xmm0, xmm0
punpcklqdq xmm2, xmm2
punpcklqdq xmm4, xmm4
- pmullw xmm2, [pw_76543210 GLOBAL]
+ pmullw xmm2, [pw_76543210]
paddsw xmm0, xmm2 ; xmm0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
movdqa xmm3, xmm0
paddsw xmm3, xmm4
@@ -1107,7 +1107,7 @@ cglobal predict_16x16_p_core_sse2, 1,2,8
punpcklqdq xmm1, xmm1
punpcklqdq xmm2, xmm2
movdqa xmm3, xmm1
- pmullw xmm3, [pw_76543210 GLOBAL]
+ pmullw xmm3, [pw_76543210]
psllw xmm1, 3
paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
@@ -1162,7 +1162,7 @@ cglobal predict_16x16_v_sse2, 1,1
cglobal predict_16x16_h_%1, 1,2
mov r1, FDEC_STRIDE*12
%ifidn %1, ssse3
- mova m1, [pb_3 GLOBAL]
+ mova m1, [pb_3]
%endif
.vloop:
%assign n 0
@@ -1214,7 +1214,7 @@ cglobal predict_16x16_dc_core_mmxext, 1,2
REP_RET
cglobal predict_16x16_dc_top_mmxext, 1,2
- PRED16x16_DC [pw_8 GLOBAL], 4
+ PRED16x16_DC [pw_8], 4
REP_RET
cglobal predict_16x16_dc_left_core_mmxext, 1,1
@@ -1247,7 +1247,7 @@ cglobal predict_16x16_dc_core_sse2, 1,1
RET
cglobal predict_16x16_dc_top_sse2, 1,1
- PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
+ PRED16x16_DC_SSE2 [pw_8], 4
RET
cglobal predict_16x16_dc_left_core_sse2, 1,1
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 52e121a..3edd244 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -86,7 +86,7 @@ SECTION .text
%endmacro
%macro QUANT_DC_START_SSSE3 0
- movdqa m5, [pb_01 GLOBAL]
+ movdqa m5, [pb_01]
movd m6, r1m ; mf
movd m7, r2m ; bias
pshufb m6, m5
@@ -361,7 +361,7 @@ cglobal x264_dequant_%2x%2_%1, 0,3
.rshift32:
neg t0d
movd m2, t0d
- mova m3, [pd_1 GLOBAL]
+ mova m3, [pd_1]
pxor m4, m4
pslld m3, m2
psrld m3, 1
@@ -381,10 +381,10 @@ cglobal x264_dequant_%2x%2_flat16_%1, 0,3
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %3
%ifdef PIC
- lea r1, [dequant%2_scale GLOBAL]
+ lea r1, [dequant%2_scale]
add r1, t2
%else
- lea r1, [dequant%2_scale + t2 GLOBAL]
+ lea r1, [dequant%2_scale + t2]
%endif
movifnidn r0, r0mp
movd m4, t0d
@@ -446,7 +446,7 @@ cglobal x264_dequant_4x4dc_%1, 0,3
.rshift32:
neg t0d
movd m3, t0d
- mova m4, [pw_1 GLOBAL]
+ mova m4, [pw_1]
mova m5, m4
pslld m4, m3
psrld m4, 1
@@ -588,15 +588,15 @@ cextern x264_decimate_table8
;This is not true for score64.
cglobal x264_decimate_score%1_%2, 1,3
%ifdef PIC
- lea r10, [x264_decimate_table4 GLOBAL]
- lea r11, [decimate_mask_table4 GLOBAL]
+ lea r10, [x264_decimate_table4]
+ lea r11, [decimate_mask_table4]
%define table r10
%define mask_table r11
%else
%define table x264_decimate_table4
%define mask_table decimate_mask_table4
%endif
- DECIMATE_MASK edx, eax, r0, [pb_1 GLOBAL], %2, ecx
+ DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx
xor edx, 0xffff
je .ret
test eax, eax
@@ -640,12 +640,12 @@ DECIMATE4x4 16, ssse3
%ifdef ARCH_X86_64
cglobal x264_decimate_score64_%1, 1,4
%ifdef PIC
- lea r10, [x264_decimate_table8 GLOBAL]
+ lea r10, [x264_decimate_table8]
%define table r10
%else
%define table x264_decimate_table8
%endif
- mova m5, [pb_1 GLOBAL]
+ mova m5, [pb_1]
DECIMATE_MASK r1d, eax, r0, m5, %1, null
test eax, eax
jne .ret9
@@ -681,7 +681,7 @@ cglobal x264_decimate_score64_%1, 1,6
%else
cglobal x264_decimate_score64_%1, 1,5
%endif
- mova m7, [pb_1 GLOBAL]
+ mova m7, [pb_1]
DECIMATE_MASK r3, r2, r0, m7, %1, r5
test r2, r2
jne .ret9
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 342a984..6db8abf 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -351,7 +351,7 @@ cglobal x264_intra_sad_x3_8x8_mmxext, 3,3
psadbw m0, m7
psadbw m1, m6
paddw m0, m1
- paddw m0, [pw_8 GLOBAL]
+ paddw m0, [pw_8]
psrlw m0, 4
punpcklbw m0, m0
pshufw m0, m0, 0x0 ;DC prediction
@@ -411,7 +411,7 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
movq m6, [r1 - FDEC_STRIDE]
add r1, FDEC_STRIDE*4
%ifidn %1,ssse3
- movq m7, [pb_3 GLOBAL]
+ movq m7, [pb_3]
%endif
INTRA_SAD_HV_ITER 0, %1
INTRA_SAD_HV_ITER 2, %1
@@ -450,7 +450,7 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
%ifidn %1, ssse3
movq2dq xmm0, m0
- pshufb xmm0, [pb_shuf8x8c GLOBAL]
+ pshufb xmm0, [pb_shuf8x8c]
movq xmm1, [r0+FENC_STRIDE*0]
movq xmm2, [r0+FENC_STRIDE*1]
movq xmm3, [r0+FENC_STRIDE*2]
@@ -522,7 +522,7 @@ cglobal x264_intra_sad_x3_16x16_%1,3,5,%2
paddw mm0, mm1
movd r3d, mm0
%ifidn %1, ssse3
- mova m1, [pb_3 GLOBAL]
+ mova m1, [pb_3]
%endif
%assign x 0
%rep 16
@@ -1301,10 +1301,10 @@ cglobal x264_pixel_sad_16x%2_cache64_%1
%endif
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
%ifdef PIC
- lea r5, [sad_w16_addr GLOBAL]
+ lea r5, [sad_w16_addr]
add r5, r4
%else
- lea r5, [sad_w16_addr + r4 GLOBAL]
+ lea r5, [sad_w16_addr + r4]
%endif
and r2, ~15
mov r4d, %2/2
@@ -1323,7 +1323,7 @@ cglobal x264_pixel_sad_16x%2_cache64_%1
jle x264_pixel_sad_%1x%2_mmxext
and eax, 7
shl eax, 3
- movd mm6, [sw_64 GLOBAL]
+ movd mm6, [sw_64]
movd mm7, eax
psubw mm6, mm7
PROLOGUE 4,5
diff --git a/common/x86/util.h b/common/x86/util.h
index efc700a..c8bcf4b 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -45,8 +45,9 @@ static inline void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b,
#define x264_predictor_difference x264_predictor_difference_mmxext
static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t i_mvc )
{
- int sum = 0;
- uint16_t output[4];
+ int sum;
+ static const uint64_t pw_1 = 0x0001000100010001ULL;
+
asm(
"pxor %%mm4, %%mm4 \n"
"test $1, %1 \n"
@@ -56,7 +57,7 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
"psubw %%mm3, %%mm0 \n"
"jmp 2f \n"
"3: \n"
- "sub $1, %1 \n"
+ "dec %1 \n"
"1: \n"
"movq -8(%2,%1,4), %%mm0 \n"
"psubw -4(%2,%1,4), %%mm0 \n"
@@ -67,11 +68,13 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
"pmaxsw %%mm2, %%mm0 \n"
"paddusw %%mm0, %%mm4 \n"
"jg 1b \n"
- "movq %%mm4, %0 \n"
- :"=m"(output), "+r"(i_mvc)
- :"r"(mvc), "m"(M64( mvc ))
+ "pmaddwd %4, %%mm4 \n"
+ "pshufw $14, %%mm4, %%mm0 \n"
+ "paddd %%mm0, %%mm4 \n"
+ "movd %%mm4, %0 \n"
+ :"=r"(sum), "+r"(i_mvc)
+ :"r"(mvc), "m"(M64( mvc )), "m"(pw_1)
);
- sum += output[0] + output[1] + output[2] + output[3];
return sum;
}
#define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 2a91084..ee3eca9 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -65,28 +65,16 @@
%endif
%endmacro
-; PIC support macros.
-; x86_64 can't fit 64bit address literals in most instruction types,
-; so shared objects (under the assumption that they might be anywhere
-; in memory) must use an address mode that does fit.
-; So all accesses to global variables must use this macro, e.g.
-; mov eax, [foo GLOBAL]
-; instead of
-; mov eax, [foo]
-;
-; x86_32 doesn't require PIC.
-; Some distros prefer shared objects to be PIC, but nothing breaks if
-; the code contains a few textrels, so we'll skip that complexity.
-
%ifdef WIN64
%define PIC
%elifndef ARCH_X86_64
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
%undef PIC
%endif
%ifdef PIC
- %define GLOBAL wrt rip
-%else
- %define GLOBAL
+ default rel
%endif
; Macros to eliminate most code duplication between x86_32 and x86_64:
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index b822688..d70bb0e 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -239,10 +239,10 @@
; %3/%4: source regs
; %5/%6: tmp regs
%ifidn %1, d
-%define mask [mask_10 GLOBAL]
+%define mask [mask_10]
%define shift 16
%elifidn %1, q
-%define mask [mask_1100 GLOBAL]
+%define mask [mask_1100]
%define shift 32
%endif
%if %0==6 ; less dependency if we have two tmp
diff --git a/configure b/configure
index b254383..25f5458 100755
--- a/configure
+++ b/configure
@@ -23,6 +23,7 @@ echo " --extra-cflags=ECFLAGS add ECFLAGS to CFLAGS"
echo " --extra-ldflags=ELDFLAGS add ELDFLAGS to LDFLAGS"
echo " --host=HOST build programs to run on HOST"
echo " --cross-prefix=PREFIX use PREFIX for compilation tools"
+echo " --sysroot=SYSROOT root of cross-build tree"
echo ""
exit 1
fi
@@ -223,6 +224,10 @@ for opt do
--cross-prefix=*)
cross_prefix="${opt#--cross-prefix=}"
;;
+ --sysroot=*)
+ CFLAGS="$CFLAGS --sysroot=${opt#--sysroot=}"
+ LDFLAGS="$LDFLAGS --sysroot=${opt#--sysroot=}"
+ ;;
*)
echo "Unknown option $opt, ignored"
;;
@@ -367,7 +372,17 @@ case $host_cpu in
;;
arm*)
ARCH="ARM"
- AS="${AS-${cross_prefix}gcc}"
+ if [ "$SYS" = MACOSX ] ; then
+ AS="${AS-extras/gas-preprocessor.pl $CC}"
+ ASFLAGS="$ASFLAGS -DPREFIX -DPIC" # apple's ld doesn't support movw/movt relocations at all
+ # build for armv7 by default
+ if ! echo $CFLAGS | grep -Eq '\-arch' ; then
+ CFLAGS="$CFLAGS -arch armv7"
+ LDFLAGS="$LDFLAGS -arch armv7"
+ fi
+ else
+ AS="${AS-${cross_prefix}gcc}"
+ fi
;;
s390|s390x)
ARCH="S390"
@@ -427,10 +442,10 @@ if [ $asm = yes -a $ARCH = ARM ] ; then
# set flags so neon is built by default
echo $CFLAGS | grep -Eq '(-mcpu|-march|-mfpu|-mfloat-abi)' || CFLAGS="$CFLAGS -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp"
- if cc_check '' '' '__asm__("rev ip, ip");' ; then define HAVE_ARMV6 && ASFLAGS="$ASFLAGS -DHAVE_ARMV6"
- cc_check '' '' '__asm__("movt r0, #0");' && define HAVE_ARMV6T2 && ASFLAGS="$ASFLAGS -DHAVE_ARMV6T2"
- cc_check '' '' '__asm__("vadd.i16 q0, q0, q0");' && define HAVE_NEON && ASFLAGS="$ASFLAGS -DHAVE_NEON"
- ASFLAGS="$ASFLAGS -c"
+ if cc_check '' '' '__asm__("rev ip, ip");' ; then define HAVE_ARMV6
+ cc_check '' '' '__asm__("movt r0, #0");' && define HAVE_ARMV6T2
+ cc_check '' '' '__asm__("vadd.i16 q0, q0, q0");' && define HAVE_NEON
+ ASFLAGS="$ASFLAGS $CFLAGS -c"
else
echo "You specified a pre-ARMv6 or Thumb-1 CPU in your CFLAGS."
echo "If you really want to run on such a CPU, configure with --disable-asm."
diff --git a/doc/standards.txt b/doc/standards.txt
index db9a691..7474d8f 100644
--- a/doc/standards.txt
+++ b/doc/standards.txt
@@ -4,6 +4,7 @@ checkasm is written in gcc, with no attempt at compatibility with anything else.
We make the following additional assumptions which are true of real systems but not guaranteed by C99:
* Two's complement.
* Signed right-shifts are sign-extended.
+* int is 32-bit or larger.
x86-specific assumptions:
* The stack is 16-byte aligned. We align it on entry to libx264 and on entry to any thread, but the compiler must preserve alignment after that.
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 666596b..1d48b7d 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -40,6 +40,7 @@ typedef struct
int i_ref;
int i_rd16x16;
x264_me_t me16x16;
+ x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
/* 8x8 */
int i_cost8x8;
@@ -361,8 +362,12 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
h->mb.i_me_method = h->param.analyse.i_me_method;
h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
+ if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
+ h->mb.i_subpel_refine--;
h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
&& h->mb.i_subpel_refine >= 5;
+ h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
+ (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
h->mb.b_transform_8x8 = 0;
h->mb.b_noise_reduction = 0;
@@ -1722,20 +1727,45 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
a->l1.me16x16.i_ref = a->l1.i_ref;
/* get cost of BI mode */
+ int ref_costs = REF_COST( 0, a->l0.i_ref ) + REF_COST( 1, a->l1.i_ref );
+ h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
+ h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
src0 = h->mc.get_ref( pix0, &stride0,
h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
- a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16, weight_none );
+ a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
src1 = h->mc.get_ref( pix1, &stride1,
h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
- a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16, weight_none );
+ a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
- + REF_COST( 0, a->l0.i_ref )
- + REF_COST( 1, a->l1.i_ref )
- + a->l0.me16x16.cost_mv
- + a->l1.me16x16.cost_mv;
+ + ref_costs
+ + a->l0.bi16x16.cost_mv
+ + a->l1.bi16x16.cost_mv;
+
+
+ /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
+ if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
+ {
+ int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
+ + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
+ int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
+ + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
+ h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.i_ref][0], h->mb.pic.i_stride[0],
+ h->mb.pic.p_fref[1][a->l1.i_ref][0], h->mb.pic.i_stride[0],
+ h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
+ int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
+ + ref_costs + l0_mv_cost + l1_mv_cost;
+ if( cost00 < a->i_cost16x16bi )
+ {
+ M32( a->l0.bi16x16.mv ) = 0;
+ M32( a->l1.bi16x16.mv ) = 0;
+ a->l0.bi16x16.cost_mv = l0_mv_cost;
+ a->l1.bi16x16.cost_mv = l1_mv_cost;
+ a->i_cost16x16bi = cost00;
+ }
+ }
/* mb type cost */
a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
@@ -2205,7 +2235,7 @@ static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
{
case D_16x16:
if( h->mb.i_type == B_BI_BI )
- x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
+ x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
break;
case D_16x8:
for( i=0; i<2; i++ )
@@ -2277,9 +2307,10 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
int last_qp_tried = 0;
origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
+ int origcbp = h->mb.cbp[h->mb.i_mb_xy];
/* If CBP is already zero, don't raise the quantizer any higher. */
- for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
+ for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
{
/* Without psy-RD, require monotonicity when moving quant away from previous
* macroblock's quant; allow 1 failure when moving quant towards previous quant.
@@ -2294,14 +2325,47 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
h->mb.i_qp = orig_qp;
failures = 0;
prevcost = origcost;
+
+ /* If the current QP results in an empty CBP, it's highly likely that lower QPs
+ * (up to a point) will too. So, jump down to where the threshold will kick in
+ * and check the QP there. If the CBP is still empty, skip the main loop.
+ * If it isn't empty, we would have ended up having to check this QP anyways,
+ * so as long as we store it for later lookup, we lose nothing. */
+ int already_checked_qp = -1;
+ int already_checked_cost = COST_MAX;
+ if( direction == -1 )
+ {
+ if( !origcbp )
+ {
+ h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
+ h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
+ already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
+ if( !h->mb.cbp[h->mb.i_mb_xy] )
+ {
+ /* If our empty-CBP block is lower QP than the last QP,
+ * the last QP almost surely doesn't have a CBP either. */
+ if( h->mb.i_last_qp > h->mb.i_qp )
+ last_qp_tried = 1;
+ break;
+ }
+ already_checked_qp = h->mb.i_qp;
+ h->mb.i_qp = orig_qp;
+ }
+ }
+
h->mb.i_qp += direction;
while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
{
if( h->mb.i_last_qp == h->mb.i_qp )
last_qp_tried = 1;
- h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
- cost = x264_rd_cost_mb( h, a->i_lambda2 );
- COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
+ if( h->mb.i_qp == already_checked_qp )
+ cost = already_checked_cost;
+ else
+ {
+ h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
+ cost = x264_rd_cost_mb( h, a->i_lambda2 );
+ COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
+ }
/* We can't assume that the costs are monotonic over QPs.
* Tie case-as-failure seems to give better results. */
@@ -2819,8 +2883,8 @@ intra_analysis:
}
else if( i_type == B_BI_BI )
{
- x264_me_refine_qpel( h, &analysis.l0.me16x16 );
- x264_me_refine_qpel( h, &analysis.l1.me16x16 );
+ x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
+ x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
}
}
else if( i_partition == D_16x8 )
@@ -2938,7 +3002,7 @@ intra_analysis:
x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
}
else if( i_type == B_BI_BI )
- x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
+ x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
}
else if( i_partition == D_16x8 )
{
@@ -3121,10 +3185,10 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
break;
case B_BI_BI:
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
- x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
+ x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
- x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
+ x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
break;
}
break;
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index c65c9bd..85d2dde 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -147,10 +147,9 @@ static int block_residual_write_cavlc( x264_t *h, int i_ctxBlockCat, int16_t *l,
if( i_trailing < i_total )
{
- int16_t val = runlevel.level[i_trailing];
- int16_t val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
- if( i_trailing < 3 )
- val -= (val>>15)|1; /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
+ int val = runlevel.level[i_trailing];
+ int val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
+ val -= ((val>>31)|1) & -(i_trailing < 3); /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
val += LEVEL_TABLE_SIZE/2;
if( (unsigned)val_original < LEVEL_TABLE_SIZE )
diff --git a/encoder/encoder.c b/encoder/encoder.c
index d873cd0..df62389 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -84,7 +84,7 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
x264_param_t *param = &h->param;
int i;
- /* First we fill all field */
+ /* First we fill all fields */
sh->sps = sps;
sh->pps = pps;
@@ -108,12 +108,24 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
sh->i_redundant_pic_cnt = 0;
- if( !h->mb.b_direct_auto_read )
+ h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
+ && h->param.i_bframe
+ && ( h->param.rc.b_stat_write || !h->param.rc.b_stat_read );
+
+ if( !h->mb.b_direct_auto_read && sh->i_type == SLICE_TYPE_B )
{
- if( h->mb.b_direct_auto_write )
- sh->b_direct_spatial_mv_pred = ( h->stat.i_direct_score[1] > h->stat.i_direct_score[0] );
+ if( h->fref1[0]->i_poc_l0ref0 == h->fref0[0]->i_poc )
+ {
+ if( h->mb.b_direct_auto_write )
+ sh->b_direct_spatial_mv_pred = ( h->stat.i_direct_score[1] > h->stat.i_direct_score[0] );
+ else
+ sh->b_direct_spatial_mv_pred = ( param->analyse.i_direct_mv_pred == X264_DIRECT_PRED_SPATIAL );
+ }
else
- sh->b_direct_spatial_mv_pred = ( param->analyse.i_direct_mv_pred == X264_DIRECT_PRED_SPATIAL );
+ {
+ h->mb.b_direct_auto_write = 0;
+ sh->b_direct_spatial_mv_pred = 1;
+ }
}
/* else b_direct_spatial_mv_pred was read from the 2pass statsfile */
@@ -430,11 +442,6 @@ static int x264_validate_parameters( x264_t *h )
x264_log( h, X264_LOG_WARNING, "interlace + me=esa is not implemented\n" );
h->param.analyse.i_me_method = X264_ME_UMH;
}
- if( h->param.analyse.i_direct_mv_pred > X264_DIRECT_PRED_SPATIAL )
- {
- x264_log( h, X264_LOG_WARNING, "interlace + direct=temporal is not implemented\n" );
- h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
- }
if( h->param.analyse.i_weighted_pred > 0 )
{
x264_log( h, X264_LOG_WARNING, "interlace + weightp is not implemented\n" );
@@ -507,6 +514,39 @@ static int x264_validate_parameters( x264_t *h )
}
h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, 51 );
h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
+ if( h->param.rc.i_vbv_buffer_size )
+ {
+ if( h->param.rc.i_rc_method == X264_RC_CQP )
+ {
+ x264_log( h, X264_LOG_WARNING, "VBV is incompatible with constant QP, ignored.\n" );
+ h->param.rc.i_vbv_max_bitrate = 0;
+ h->param.rc.i_vbv_buffer_size = 0;
+ }
+ else if( h->param.rc.i_vbv_max_bitrate == 0 )
+ {
+ if( h->param.rc.i_rc_method == X264_RC_ABR )
+ {
+ x264_log( h, X264_LOG_WARNING, "VBV maxrate unspecified, assuming CBR\n" );
+ h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
+ }
+ else
+ {
+ x264_log( h, X264_LOG_WARNING, "VBV bufsize set but maxrate unspecified, ignored\n" );
+ h->param.rc.i_vbv_buffer_size = 0;
+ }
+ }
+ else if( h->param.rc.i_vbv_max_bitrate < h->param.rc.i_bitrate &&
+ h->param.rc.i_rc_method == X264_RC_ABR )
+ {
+ x264_log( h, X264_LOG_WARNING, "max bitrate less than average bitrate, assuming CBR\n" );
+ h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
+ }
+ }
+ else if( h->param.rc.i_vbv_max_bitrate )
+ {
+ x264_log( h, X264_LOG_WARNING, "VBV maxrate specified, but no bufsize, ignored\n" );
+ h->param.rc.i_vbv_max_bitrate = 0;
+ }
int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
if( h->param.b_sliced_threads )
@@ -566,8 +606,6 @@ static int x264_validate_parameters( x264_t *h )
x264_log( h, X264_LOG_WARNING, "ref > 1 + intra-refresh is not supported\n" );
h->param.i_frame_reference = 1;
}
- if( h->param.b_intra_refresh )
- h->param.i_keyint_max = X264_MIN( h->param.i_keyint_max, (h->param.i_width+15)/16 - 1 );
h->param.i_keyint_min = x264_clip3( h->param.i_keyint_min, 1, h->param.i_keyint_max/2+1 );
h->param.rc.i_lookahead = x264_clip3( h->param.rc.i_lookahead, 0, X264_LOOKAHEAD_MAX );
{
@@ -597,10 +635,6 @@ static int x264_validate_parameters( x264_t *h )
h->param.i_sync_lookahead = 0;
#endif
- h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
- && h->param.i_bframe
- && ( h->param.rc.b_stat_write || !h->param.rc.b_stat_read );
-
h->param.i_deblocking_filter_alphac0 = x264_clip3( h->param.i_deblocking_filter_alphac0, -6, 6 );
h->param.i_deblocking_filter_beta = x264_clip3( h->param.i_deblocking_filter_beta, -6, 6 );
h->param.analyse.i_luma_deadzone[0] = x264_clip3( h->param.analyse.i_luma_deadzone[0], 0, 32 );
@@ -659,8 +693,6 @@ static int x264_validate_parameters( x264_t *h )
/* Psy trellis has a similar effect. */
if( h->mb.i_psy_trellis )
h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_trellis < 0.25 ? 1 : 2;
- else
- h->mb.i_psy_trellis = 0;
h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );
h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 );
@@ -1071,7 +1103,7 @@ fail:
****************************************************************************/
int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
{
- h = h->thread[h->i_thread_phase];
+ h = h->thread[h->thread[0]->i_thread_phase];
x264_set_aspect_ratio( h, param, 0 );
#define COPY(var) h->param.var = param->var
COPY( i_frame_reference ); // but never uses more refs than initially specified
@@ -1110,11 +1142,30 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
COPY( i_slice_max_size );
COPY( i_slice_max_mbs );
COPY( i_slice_count );
+ /* VBV can't be turned on if it wasn't on to begin with */
+ if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 &&
+ param->rc.i_vbv_max_bitrate > 0 && param->rc.i_vbv_buffer_size > 0 )
+ {
+ COPY( rc.i_vbv_max_bitrate );
+ COPY( rc.i_vbv_buffer_size );
+ COPY( rc.i_bitrate );
+ }
+ COPY( rc.f_rf_constant );
#undef COPY
mbcmp_init( h );
- return x264_validate_parameters( h );
+ int ret = x264_validate_parameters( h );
+
+ /* Supported reconfiguration options (1-pass only):
+ * vbv-maxrate
+ * vbv-bufsize
+ * crf
+ * bitrate (CBR only) */
+ if( !ret )
+ x264_ratecontrol_init_reconfigurable( h, 0 );
+
+ return ret;
}
/****************************************************************************
@@ -2010,6 +2061,8 @@ static int x264_threaded_slices_write( x264_t *h )
for( i = 0; i <= h->sps->i_mb_height; i++ )
x264_fdec_filter_row( h, i );
+ x264_threads_merge_ratecontrol( h );
+
for( i = 1; i < h->param.i_threads; i++ )
{
x264_t *t = h->thread[i];
@@ -2025,8 +2078,6 @@ static int x264_threaded_slices_write( x264_t *h )
((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
}
- x264_threads_merge_ratecontrol( h );
-
return 0;
}
@@ -2255,22 +2306,22 @@ int x264_encoder_encode( x264_t *h,
if( h->param.b_intra_refresh && h->fenc->i_type == X264_TYPE_P )
{
int pocdiff = (h->fdec->i_poc - h->fref0[0]->i_poc)/2;
- float increment = ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max;
+ float increment = X264_MAX( ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max, 1 );
+ int max_position = (int)(increment * h->param.i_keyint_max);
if( IS_X264_TYPE_I( h->fref0[0]->i_type ) )
h->fdec->f_pir_position = 0;
else
{
- if( h->fref0[0]->i_pir_end_col == h->sps->i_mb_width - 1 )
+ h->fdec->f_pir_position = h->fref0[0]->f_pir_position;
+ if( h->fdec->f_pir_position+0.5 >= max_position )
{
h->fdec->f_pir_position = 0;
h->fenc->b_keyframe = 1;
}
- else
- h->fdec->f_pir_position = h->fref0[0]->f_pir_position;
}
h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
h->fdec->f_pir_position += increment * pocdiff;
- h->fdec->i_pir_end_col = X264_MIN( h->fdec->f_pir_position+0.5, h->sps->i_mb_width-1 );
+ h->fdec->i_pir_end_col = h->fdec->f_pir_position+0.5;
}
/* Write SPS and PPS */
@@ -2306,8 +2357,9 @@ int x264_encoder_encode( x264_t *h,
if( h->fenc->i_type != X264_TYPE_IDR )
{
+ int time_to_recovery = X264_MIN( h->sps->i_mb_width - 1, h->param.i_keyint_max ) + h->param.i_bframe;
x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
- x264_sei_recovery_point_write( h, &h->out.bs, h->param.i_keyint_max );
+ x264_sei_recovery_point_write( h, &h->out.bs, time_to_recovery );
x264_nal_end( h );
overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
}
@@ -2327,6 +2379,9 @@ int x264_encoder_encode( x264_t *h,
x264_reference_check_reorder( h );
}
+ if( h->i_ref0 )
+ h->fdec->i_poc_l0ref0 = h->fref0[0]->i_poc;
+
if( h->sh.i_type == SLICE_TYPE_B )
x264_macroblock_bipred_init( h );
@@ -2762,7 +2817,8 @@ void x264_encoder_close ( x264_t *h )
x264_log( h, X264_LOG_INFO, "8x8 transform intra:%.1f%%%s\n", 100. * i_i8x8 / i_intra, buf );
}
- if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
+ if( (h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO ||
+ (h->stat.i_direct_frames[0] && h->stat.i_direct_frames[1]))
&& h->stat.i_frame_count[SLICE_TYPE_B] )
{
x264_log( h, X264_LOG_INFO, "direct mvs spatial:%.1f%% temporal:%.1f%%\n",
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index e4edb8a..f67a898 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -42,30 +42,24 @@ static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[4] )
int d1 = dct[2] + dct[3]; \
int d2 = dct[0] - dct[1]; \
int d3 = dct[2] - dct[3]; \
- int dmf = dequant_mf[i_qp%6][0]; \
- int qbits = i_qp/6 - 5; \
- if( qbits > 0 ) \
- { \
- dmf <<= qbits; \
- qbits = 0; \
- }
+ int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
static inline void idct_dequant_2x2_dc( int16_t dct[4], int16_t dct4x4[4][16], int dequant_mf[6][16], int i_qp )
{
IDCT_DEQUANT_START
- dct4x4[0][0] = (d0 + d1) * dmf >> -qbits;
- dct4x4[1][0] = (d0 - d1) * dmf >> -qbits;
- dct4x4[2][0] = (d2 + d3) * dmf >> -qbits;
- dct4x4[3][0] = (d2 - d3) * dmf >> -qbits;
+ dct4x4[0][0] = (d0 + d1) * dmf >> 5;
+ dct4x4[1][0] = (d0 - d1) * dmf >> 5;
+ dct4x4[2][0] = (d2 + d3) * dmf >> 5;
+ dct4x4[3][0] = (d2 - d3) * dmf >> 5;
}
static inline void idct_dequant_2x2_dconly( int16_t out[4], int16_t dct[4], int dequant_mf[6][16], int i_qp )
{
IDCT_DEQUANT_START
- out[0] = (d0 + d1) * dmf >> -qbits;
- out[1] = (d0 - d1) * dmf >> -qbits;
- out[2] = (d2 + d3) * dmf >> -qbits;
- out[3] = (d2 - d3) * dmf >> -qbits;
+ out[0] = (d0 + d1) * dmf >> 5;
+ out[1] = (d0 - d1) * dmf >> 5;
+ out[2] = (d2 + d3) * dmf >> 5;
+ out[3] = (d2 - d3) * dmf >> 5;
}
static inline void dct2x2dc( int16_t d[4], int16_t dct4x4[4][16] )
@@ -208,8 +202,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
ALIGNED_ARRAY_16( int16_t, dct_dc4x4,[16] );
int i, nz;
- int b_decimate = h->sh.i_type == SLICE_TYPE_B || (h->param.analyse.b_dct_decimate && h->sh.i_type == SLICE_TYPE_P);
- int decimate_score = b_decimate ? 0 : 9;
+ int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
if( h->mb.b_lossless )
{
@@ -342,7 +335,7 @@ static inline int x264_mb_optimize_chroma_dc( x264_t *h, int b_inter, int i_qp,
void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{
int i, ch, nz, nz_dc;
- int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
+ int b_decimate = b_inter && h->mb.b_dct_decimate;
ALIGNED_ARRAY_16( int16_t, dct2x2,[4] );
h->mb.i_cbp_chroma = 0;
@@ -607,7 +600,7 @@ void x264_macroblock_encode( x264_t *h )
{
int i_cbp_dc = 0;
int i_qp = h->mb.i_qp;
- int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
+ int b_decimate = h->mb.b_dct_decimate;
int b_force_no_skip = 0;
int i,idx,nz;
h->mb.i_cbp_luma = 0;
@@ -914,8 +907,7 @@ void x264_macroblock_encode( x264_t *h )
/*****************************************************************************
* x264_macroblock_probe_skip:
- * Check if the current MB could be encoded as a [PB]_SKIP (it supposes you use
- * the previous QP
+ * Check if the current MB could be encoded as a [PB]_SKIP
*****************************************************************************/
int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
{
@@ -1052,7 +1044,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
int i_qp = h->mb.i_qp;
uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
- int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
+ int b_decimate = h->mb.b_dct_decimate;
int nnz8x8 = 0;
int ch, nz;
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 63b3be6..8c61582 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -134,9 +134,11 @@ struct x264_ratecontrol_t
* This value is the current position (0 or 1). */
/* MBRC stuff */
- double frame_size_estimated;
+ float frame_size_estimated; /* Access to this variable must be atomic: double is
+ * not atomic on all arches we care about */
double frame_size_planned;
double slice_size_planned;
+ double max_frame_error;
predictor_t (*row_pred)[2];
predictor_t row_preds[5][2];
predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */
@@ -388,6 +390,53 @@ static char *x264_strcat_filename( char *input, char *suffix )
return output;
}
+void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init ) /* (re)derive VBV and CRF state from h->param; b_init is 1 when called from x264_ratecontrol_new, 0 from x264_encoder_reconfig */
+{
+ x264_ratecontrol_t *rc = h->rc;
+ if( !b_init && rc->b_2pass ) /* reconfiguration is ignored in 2-pass mode; initial setup still runs */
+ return;
+
+ if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 ) /* VBV section: only when both maxrate and bufsize are set */
+ {
+ if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) ) /* buffer must hold at least maxrate/fps */
+ {
+ h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps;
+ x264_log( h, X264_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
+ h->param.rc.i_vbv_buffer_size );
+ }
+
+ /* We don't support changing the ABR bitrate right now,
+ so if the stream starts as CBR, keep it CBR. */
+ if( rc->b_vbv_min_rate ) /* NOTE(review): on the b_init call this flag is read before it is assigned below — presumably rc is zero-initialised by the caller; confirm */
+ h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
+ rc->buffer_rate = h->param.rc.i_vbv_max_bitrate * 1000. / rc->fps; /* per-frame refill: maxrate scaled by 1000, divided by fps */
+ rc->buffer_size = h->param.rc.i_vbv_buffer_size * 1000.; /* bufsize scaled by 1000 */
+ rc->single_frame_vbv = rc->buffer_rate * 1.1 > rc->buffer_size; /* set when the buffer is barely larger than one frame's refill */
+ rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
+ * 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate); /* NOTE(review): rc->bitrate is not refreshed from h->param in this function — confirm it is updated elsewhere on reconfig */
+ if( b_init ) /* one-time setup: on reconfig the buffer fill, b_vbv and b_vbv_min_rate keep their current values */
+ {
+ if( h->param.rc.f_vbv_buffer_init > 1. ) /* values above 1 are divided by the buffer size and clipped to [0,1] */
+ h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 );
+ h->param.rc.f_vbv_buffer_init = X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size ); /* start with at least one frame's refill in the buffer */
+ rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
+ rc->b_vbv = 1;
+ rc->b_vbv_min_rate = !rc->b_2pass
+ && h->param.rc.i_rc_method == X264_RC_ABR
+ && h->param.rc.i_vbv_max_bitrate <= h->param.rc.i_bitrate; /* 1-pass ABR with maxrate <= bitrate, i.e. the "CBR" case referred to above */
+ }
+ }
+ if( h->param.rc.i_rc_method == X264_RC_CRF ) /* CRF section: recomputed on every call, so f_rf_constant can change at runtime */
+ {
+ /* Arbitrary rescaling to make CRF somewhat similar to QP.
+ * Try to compensate for MB-tree's effects as well. */
+ double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
+ double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
+ rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress )
+ / qp2qscale( h->param.rc.f_rf_constant + mbtree_offset );
+ }
+}
+
int x264_ratecontrol_new( x264_t *h )
{
x264_ratecontrol_t *rc;
@@ -426,60 +475,10 @@ int x264_ratecontrol_new( x264_t *h )
x264_log(h, X264_LOG_ERROR, "constant rate-factor is incompatible with 2pass.\n");
return -1;
}
- if( h->param.rc.i_vbv_buffer_size )
- {
- if( h->param.rc.i_rc_method == X264_RC_CQP )
- {
- x264_log(h, X264_LOG_WARNING, "VBV is incompatible with constant QP, ignored.\n");
- h->param.rc.i_vbv_max_bitrate = 0;
- h->param.rc.i_vbv_buffer_size = 0;
- }
- else if( h->param.rc.i_vbv_max_bitrate == 0 )
- {
- if( h->param.rc.i_rc_method == X264_RC_ABR )
- {
- x264_log( h, X264_LOG_INFO, "VBV maxrate unspecified, assuming CBR\n" );
- h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
- }
- else
- {
- x264_log( h, X264_LOG_INFO, "VBV bufsize set but maxrate unspecified, ignored\n" );
- h->param.rc.i_vbv_buffer_size = 0;
- }
- }
- }
- if( h->param.rc.i_vbv_max_bitrate < h->param.rc.i_bitrate &&
- h->param.rc.i_vbv_max_bitrate > 0)
- x264_log(h, X264_LOG_WARNING, "max bitrate less than average bitrate, ignored.\n");
- else if( h->param.rc.i_vbv_max_bitrate > 0 &&
- h->param.rc.i_vbv_buffer_size > 0 )
- {
- if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) )
- {
- h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps;
- x264_log( h, X264_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
- h->param.rc.i_vbv_buffer_size );
- }
- if( h->param.rc.f_vbv_buffer_init > 1. )
- h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 );
- rc->buffer_rate = h->param.rc.i_vbv_max_bitrate * 1000. / rc->fps;
- rc->buffer_size = h->param.rc.i_vbv_buffer_size * 1000.;
- rc->single_frame_vbv = rc->buffer_rate * 1.1 > rc->buffer_size;
- h->param.rc.f_vbv_buffer_init = X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size );
- rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
- rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
- * 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate);
- rc->b_vbv = 1;
- rc->b_vbv_min_rate = !rc->b_2pass
- && h->param.rc.i_rc_method == X264_RC_ABR
- && h->param.rc.i_vbv_max_bitrate <= h->param.rc.i_bitrate;
- }
- else if( h->param.rc.i_vbv_max_bitrate )
- {
- x264_log(h, X264_LOG_WARNING, "VBV maxrate specified, but no bufsize.\n");
- h->param.rc.i_vbv_max_bitrate = 0;
- }
- if(rc->rate_tolerance < 0.01)
+
+ x264_ratecontrol_init_reconfigurable( h, 1 );
+
+ if( rc->rate_tolerance < 0.01 )
{
x264_log(h, X264_LOG_WARNING, "bitrate tolerance too small, using .01\n");
rc->rate_tolerance = 0.01;
@@ -499,16 +498,6 @@ int x264_ratecontrol_new( x264_t *h )
rc->last_non_b_pict_type = SLICE_TYPE_I;
}
- if( h->param.rc.i_rc_method == X264_RC_CRF )
- {
- /* Arbitrary rescaling to make CRF somewhat similar to QP.
- * Try to compensate for MB-tree's effects as well. */
- double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
- double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
- rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress )
- / qp2qscale( h->param.rc.f_rf_constant + mbtree_offset );
- }
-
rc->ip_offset = 6.0 * log(h->param.rc.f_ip_factor) / log(2.0);
rc->pb_offset = 6.0 * log(h->param.rc.f_pb_factor) / log(2.0);
rc->qp_constant[SLICE_TYPE_P] = h->param.rc.i_qp_constant;
@@ -518,17 +507,21 @@ int x264_ratecontrol_new( x264_t *h )
rc->lstep = pow( 2, h->param.rc.i_qp_step / 6.0 );
rc->last_qscale = qp2qscale(26);
- CHECKED_MALLOC( rc->pred, 5*sizeof(predictor_t) );
+ int num_preds = h->param.b_sliced_threads * h->param.i_threads + 1;
+ CHECKED_MALLOC( rc->pred, 5 * sizeof(predictor_t) * num_preds );
CHECKED_MALLOC( rc->pred_b_from_p, sizeof(predictor_t) );
for( i = 0; i < 5; i++ )
{
rc->last_qscale_for[i] = qp2qscale( ABR_INIT_QP );
rc->lmin[i] = qp2qscale( h->param.rc.i_qp_min );
rc->lmax[i] = qp2qscale( h->param.rc.i_qp_max );
- rc->pred[i].coeff= 2.0;
- rc->pred[i].count= 1.0;
- rc->pred[i].decay= 0.5;
- rc->pred[i].offset= 0.0;
+ for( j = 0; j < num_preds; j++ )
+ {
+ rc->pred[i+j*5].coeff= 2.0;
+ rc->pred[i+j*5].count= 1.0;
+ rc->pred[i+j*5].decay= 0.5;
+ rc->pred[i+j*5].offset= 0.0;
+ }
for( j = 0; j < 2; j++ )
{
rc->row_preds[i][j].coeff= .25;
@@ -999,22 +992,6 @@ void x264_ratecontrol_delete( x264_t *h )
x264_free( rc );
}
-void x264_ratecontrol_set_estimated_size( x264_t *h, int bits )
-{
- x264_pthread_mutex_lock( &h->fenc->mutex );
- h->rc->frame_size_estimated = bits;
- x264_pthread_mutex_unlock( &h->fenc->mutex );
-}
-
-int x264_ratecontrol_get_estimated_size( x264_t const *h)
-{
- int size;
- x264_pthread_mutex_lock( &h->fenc->mutex );
- size = h->rc->frame_size_estimated;
- x264_pthread_mutex_unlock( &h->fenc->mutex );
- return size;
-}
-
static void accum_p_qp_update( x264_t *h, float qp )
{
x264_ratecontrol_t *rc = h->rc;
@@ -1186,6 +1163,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
/* tweak quality based on difference from predicted size */
if( y < h->i_threadslice_end-1 )
{
+ int i;
int prev_row_qp = h->fdec->i_row_qp[y];
int i_qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, h->param.rc.i_qp_max );
int i_qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min );
@@ -1199,19 +1177,23 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned;
float slice_size_planned = h->param.b_sliced_threads ? rc->slice_size_planned : rc->frame_size_planned;
- float size_of_other_slices = rc->frame_size_planned - slice_size_planned;
+ float size_of_other_slices = 0;
+ if( h->param.b_sliced_threads )
+ {
+ for( i = 0; i < h->param.i_threads; i++ )
+ if( h != h->thread[i] )
+ size_of_other_slices += h->thread[i]->rc->frame_size_estimated;
+ }
+ else
+ rc->max_frame_error = X264_MAX( 0.05, 1.0 / (h->sps->i_mb_width) );
+
/* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
float rc_tol = buffer_left_planned / h->param.i_threads * rc->rate_tolerance;
- float max_frame_error = X264_MAX( 0.05, 1.0 / h->sps->i_mb_height );
- int b1 = predict_row_size_sum( h, y, rc->qpm );
-
- /* Assume that if this slice has become larger than expected,
- * the other slices will have gotten equally larger. */
- b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
+ int b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
/* Don't modify the row QPs until a sufficient amount of the bits of the frame have been processed, in case a flat */
/* area at the top of the frame was measured inaccurately. */
- if( row_bits_so_far(h,y) < 0.05 * (rc->frame_size_planned-size_of_other_slices) )
+ if( row_bits_so_far( h, y ) < 0.05 * slice_size_planned )
return;
if( h->sh.i_type != SLICE_TYPE_I )
@@ -1226,8 +1208,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
(b1 > rc->frame_size_planned && rc->qpm < rc->qp_novbv)) )
{
rc->qpm ++;
- b1 = predict_row_size_sum( h, y, rc->qpm );
- b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
+ b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
}
while( rc->qpm > i_qp_min
@@ -1236,20 +1217,18 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
|| b1 < (rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 1.1) )
{
rc->qpm --;
- b1 = predict_row_size_sum( h, y, rc->qpm );
- b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
+ b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
}
/* avoid VBV underflow */
while( (rc->qpm < h->param.rc.i_qp_max)
- && (rc->buffer_fill - b1 < rc->buffer_rate * max_frame_error) )
+ && (rc->buffer_fill - b1 < rc->buffer_rate * rc->max_frame_error) )
{
rc->qpm ++;
- b1 = predict_row_size_sum( h, y, rc->qpm );
- b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
+ b1 = predict_row_size_sum( h, y, rc->qpm ) + size_of_other_slices;
}
- x264_ratecontrol_set_estimated_size(h, b1);
+ h->rc->frame_size_estimated = predict_row_size_sum( h, y, rc->qpm );
}
/* loses the fractional part of the frame-wise qp */
@@ -1293,6 +1272,7 @@ int x264_ratecontrol_slice_type( x264_t *h, int frame_num )
h->thread[i]->param.rc.b_stat_read = 0;
h->thread[i]->param.i_bframe_adaptive = 0;
h->thread[i]->param.i_scenecut_threshold = 0;
+ h->thread[i]->param.rc.b_mb_tree = 0;
if( h->thread[i]->param.i_bframe > 1 )
h->thread[i]->param.i_bframe = 1;
}
@@ -1577,15 +1557,15 @@ static void update_vbv( x264_t *h, int bits )
if( rct->buffer_fill_final < 0 )
x264_log( h, X264_LOG_WARNING, "VBV underflow (frame %d, %.0f bits)\n", h->i_frame, rct->buffer_fill_final );
rct->buffer_fill_final = X264_MAX( rct->buffer_fill_final, 0 );
- rct->buffer_fill_final += rct->buffer_rate;
- rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, rct->buffer_size );
+ rct->buffer_fill_final += rcc->buffer_rate;
+ rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, rcc->buffer_size );
}
// provisionally update VBV according to the planned size of all frames currently in progress
static void update_vbv_plan( x264_t *h, int overhead )
{
x264_ratecontrol_t *rcc = h->rc;
- rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final - overhead;
+ rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final;
if( h->i_thread_frames > 1 )
{
int j = h->rc - h->thread[0]->rc;
@@ -1596,13 +1576,15 @@ static void update_vbv_plan( x264_t *h, int overhead )
double bits = t->rc->frame_size_planned;
if( !t->b_thread_active )
continue;
- bits = X264_MAX(bits, x264_ratecontrol_get_estimated_size(t));
+ bits = X264_MAX(bits, t->rc->frame_size_estimated);
rcc->buffer_fill -= bits;
rcc->buffer_fill = X264_MAX( rcc->buffer_fill, 0 );
rcc->buffer_fill += rcc->buffer_rate;
rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
}
}
+ rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
+ rcc->buffer_fill -= overhead;
}
// apply VBV constraints and clip qscale to between lmin and lmax
@@ -1793,7 +1775,7 @@ static float rate_estimate_qscale( x264_t *h )
rcc->frame_size_planned = qscale2bits( &rce, q );
else
rcc->frame_size_planned = predict_size( rcc->pred_b_from_p, q, h->fref1[h->i_ref1-1]->i_satd );
- x264_ratecontrol_set_estimated_size(h, rcc->frame_size_planned);
+ h->rc->frame_size_estimated = rcc->frame_size_planned;
/* For row SATDs */
if( rcc->b_vbv )
@@ -1802,13 +1784,15 @@ static float rate_estimate_qscale( x264_t *h )
}
else
{
- double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate * h->i_thread_frames;
+ double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate;
if( rcc->b_2pass )
{
- //FIXME adjust abr_buffer based on distance to the end of the video
int64_t diff;
int64_t predicted_bits = total_bits;
+ /* Adjust ABR buffer based on distance to the end of the video. */
+ if( rcc->num_entries > h->fenc->i_frame )
+ abr_buffer *= 0.5 * sqrt( rcc->num_entries - h->fenc->i_frame );
if( rcc->b_vbv )
{
@@ -1822,7 +1806,7 @@ static float rate_estimate_qscale( x264_t *h )
double bits = t->rc->frame_size_planned;
if( !t->b_thread_active )
continue;
- bits = X264_MAX(bits, x264_ratecontrol_get_estimated_size(t));
+ bits = X264_MAX(bits, t->rc->frame_size_estimated);
predicted_bits += (int64_t)bits;
}
}
@@ -1963,61 +1947,96 @@ static float rate_estimate_qscale( x264_t *h )
/* Always use up the whole VBV in this case. */
if( rcc->single_frame_vbv )
rcc->frame_size_planned = rcc->buffer_rate;
- x264_ratecontrol_set_estimated_size(h, rcc->frame_size_planned);
+ h->rc->frame_size_estimated = rcc->frame_size_planned;
return q;
}
}
+void x264_threads_normalize_predictors( x264_t *h ) /* rescale every thread's slice_size_planned so that they sum to h->rc->frame_size_planned */
+{
+ int i;
+ double totalsize = 0;
+ for( i = 0; i < h->param.i_threads; i++ ) /* sum of the current per-thread plans */
+ totalsize += h->thread[i]->rc->slice_size_planned;
+ double factor = h->rc->frame_size_planned / totalsize; /* NOTE(review): no guard for totalsize == 0, which would make factor inf/NaN — confirm callers always produce a nonzero sum */
+ for( i = 0; i < h->param.i_threads; i++ ) /* apply the common scale factor in place */
+ h->thread[i]->rc->slice_size_planned *= factor;
+}
+
void x264_threads_distribute_ratecontrol( x264_t *h )
{
- int i, row, totalsize = 0;
- if( h->rc->b_vbv )
- for( row = 0; row < h->sps->i_mb_height; row++ )
- totalsize += h->fdec->i_row_satd[row];
+ int i, row;
+ x264_ratecontrol_t *rc = h->rc;
+
+ /* Initialize row predictors */
+ if( h->i_frame == 0 )
+ for( i = 0; i < h->param.i_threads; i++ )
+ {
+ x264_ratecontrol_t *t = h->thread[i]->rc;
+ memcpy( t->row_preds, rc->row_preds, sizeof(rc->row_preds) );
+ }
+
for( i = 0; i < h->param.i_threads; i++ )
{
x264_t *t = h->thread[i];
- x264_ratecontrol_t *rc = h->rc;
- memcpy( t->rc, rc, sizeof(x264_ratecontrol_t) );
+ memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) );
+ t->rc->row_pred = &t->rc->row_preds[h->sh.i_type];
/* Calculate the planned slice size. */
- if( h->rc->b_vbv && rc->frame_size_planned )
+ if( rc->b_vbv && rc->frame_size_planned )
{
int size = 0;
for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
size += h->fdec->i_row_satd[row];
- t->rc->slice_size_planned = size * rc->frame_size_planned / totalsize;
+ t->rc->slice_size_planned = predict_size( &rc->pred[h->sh.i_type + (i+1)*5], rc->qpm, size );
}
else
t->rc->slice_size_planned = 0;
}
+ if( rc->b_vbv && rc->frame_size_planned )
+ {
+ x264_threads_normalize_predictors( h );
+
+ if( rc->single_frame_vbv )
+ {
+ /* Compensate for our max frame error threshold: give more bits (proportionally) to smaller slices. */
+ for( i = 0; i < h->param.i_threads; i++ )
+ {
+ x264_t *t = h->thread[i];
+ t->rc->max_frame_error = X264_MAX( 0.05, 1.0 / (t->i_threadslice_end - t->i_threadslice_start) );
+ t->rc->slice_size_planned += 2 * t->rc->max_frame_error * rc->frame_size_planned;
+ }
+ x264_threads_normalize_predictors( h );
+ }
+
+ for( i = 0; i < h->param.i_threads; i++ )
+ h->thread[i]->rc->frame_size_estimated = h->thread[i]->rc->slice_size_planned;
+ }
}
void x264_threads_merge_ratecontrol( x264_t *h ) /* after a sliced-threads frame: train each thread's slice-size predictor and fold per-thread QP accumulators into h->rc */
{
- int i, j, k;
+ int i, row;
x264_ratecontrol_t *rc = h->rc;
x264_emms();
- for( i = 1; i < h->param.i_threads; i++ )
+ for( i = 0; i < h->param.i_threads; i++ )
{
- x264_ratecontrol_t *t = h->thread[i]->rc;
- rc->qpa_rc += t->qpa_rc;
- rc->qpa_aq += t->qpa_aq;
- for( j = 0; j < 5; j++ )
- for( k = 0; k < 2; k++ )
- {
- rc->row_preds[j][k].coeff += t->row_preds[j][k].coeff;
- rc->row_preds[j][k].offset += t->row_preds[j][k].offset;
- rc->row_preds[j][k].count += t->row_preds[j][k].count;
- }
- }
- for( j = 0; j < 5; j++ )
- for( k = 0; k < 2; k++ )
+ x264_t *t = h->thread[i];
+ x264_ratecontrol_t *rct = h->thread[i]->rc;
+ if( h->param.rc.i_vbv_buffer_size )
{
- rc->row_preds[j][k].coeff /= h->param.i_threads;
- rc->row_preds[j][k].offset /= h->param.i_threads;
- rc->row_preds[j][k].count /= h->param.i_threads;
+ int size = 0; /* total row SATD of this thread's slice */
+ for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
+ size += h->fdec->i_row_satd[row];
+ int bits = t->stat.frame.i_mv_bits + t->stat.frame.i_tex_bits + t->stat.frame.i_misc_bits; /* bits this slice actually produced */
+ int mb_count = (t->i_threadslice_end - t->i_threadslice_start) * h->sps->i_mb_width;
+ update_predictor( &rc->pred[h->sh.i_type+(i+1)*5], qp2qscale(rct->qpa_rc/mb_count), size, bits ); /* thread i owns predictor set i+1: must match the index read in x264_threads_distribute_ratecontrol */
}
+ if( !i ) /* thread 0's accumulators are not added onto rc again */
+ continue;
+ rc->qpa_rc += rct->qpa_rc;
+ rc->qpa_aq += rct->qpa_aq;
+ }
}
void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
@@ -2027,8 +2046,7 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
#define COPY(var) memcpy(&cur->rc->var, &prev->rc->var, sizeof(cur->rc->var))
/* these vars are updated in x264_ratecontrol_start()
* so copy them from the context that most recently started (prev)
- * to the context that's about to start (cur).
- */
+ * to the context that's about to start (cur). */
COPY(accum_p_qp);
COPY(accum_p_norm);
COPY(last_satd);
@@ -2040,6 +2058,14 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
COPY(bframes);
COPY(prev_zone);
COPY(qpbuf_pos);
+ /* these vars can be updated by x264_ratecontrol_init_reconfigurable */
+ COPY(buffer_rate);
+ COPY(buffer_size);
+ COPY(single_frame_vbv);
+ COPY(cbr_decay);
+ COPY(b_vbv_min_rate);
+ COPY(rate_factor_constant);
+ COPY(bitrate);
#undef COPY
}
if( cur != next )
@@ -2047,8 +2073,7 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
#define COPY(var) next->rc->var = cur->rc->var
/* these vars are updated in x264_ratecontrol_end()
* so copy them from the context that most recently ended (cur)
- * to the context that's about to end (next)
- */
+ * to the context that's about to end (next) */
COPY(cplxr_sum);
COPY(expected_bits_sum);
COPY(wanted_bits_window);
diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
index 5a8d088..2767866 100644
--- a/encoder/ratecontrol.h
+++ b/encoder/ratecontrol.h
@@ -27,6 +27,8 @@
int x264_ratecontrol_new ( x264_t * );
void x264_ratecontrol_delete( x264_t * );
+void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init );
+
void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
void x264_adaptive_quant( x264_t * );
int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 057f6a6..bb2ed64 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -1394,10 +1394,10 @@ int x264_rc_analyse_slice( x264_t *h )
int mb_xy = y * h->mb.i_mb_stride;
for( x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ )
{
- int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor) >> 8;
+ int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor + 128) >> 8;
int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy];
int diff = intra_cost - inter_cost;
- h->fdec->i_row_satd[y] += diff;
+ h->fdec->i_row_satd[y] += (diff * frames[b]->i_inv_qscale_factor[mb_xy] + 128) >> 8;
cost += diff;
}
}
diff --git a/extras/gas-preprocessor.pl b/extras/gas-preprocessor.pl
new file mode 100755
index 0000000..d60893c
--- /dev/null
+++ b/extras/gas-preprocessor.pl
@@ -0,0 +1,256 @@
+#!/usr/bin/env perl
+# by David Conrad
+# This code is licensed under GPLv2 or later; go to gnu.org to read it
+# (not that it much matters for an asm preprocessor)
+# usage: set your assembler to be something like "perl gas-preprocessor.pl gcc"
+use strict;
+
+# Apple's gas is ancient and doesn't support modern preprocessing features like
+# .rept and has ugly macro syntax, among other things. Thus, this script
+# implements the subset of the gas preprocessor used by x264 and ffmpeg
+# that isn't supported by Apple's gas.
+
+# FIXME: doesn't work if the path has spaces, but oh well...
+my $gcc_cmd = join(' ', @ARGV);
+my $preprocess_c_cmd;
+
+if ($gcc_cmd =~ /\S+\.c/) {
+ # C file (inline asm?) - compile
+ $preprocess_c_cmd = "$gcc_cmd -S";
+ $gcc_cmd =~ s/\S+\.c/-x assembler -/g;
+} elsif ($gcc_cmd =~ /\S+\.S/) {
+ # asm file, just do C preprocessor
+ $preprocess_c_cmd = "$gcc_cmd -E";
+ $gcc_cmd =~ s/\S+\.S/-x assembler -/g;
+} else {
+ die "Unrecognized input filetype";
+}
+
+$preprocess_c_cmd =~ s/\S+\.o/-/g;
+
+open(ASMFILE, "-|", $preprocess_c_cmd) || die "Error running preprocessor";
+
+my $current_macro = '';
+my %macro_lines;
+my %macro_args;
+my %macro_args_default;
+
+my @pass1_lines;
+
+# pass 1: parse .macro
+# note that the handling of arguments is probably overly permissive vs. gas
+# but it should be the same for valid cases
+while (<ASMFILE>) {
+ # comment out unsupported directives
+ s/\.type/@.type/x;
+ s/\.func/@.func/x;
+ s/\.endfunc/@.endfunc/x;
+ s/\.ltorg/@.ltorg/x;
+ s/\.size/@.size/x;
+ s/\.fpu/@.fpu/x;
+
+ # the syntax for these is a little different
+ s/\.global/.globl/x;
+ # also catch .section .rodata since the equivalent to .const_data is .section __DATA,__const
+ s/(.*)\.rodata/.const_data/x;
+ s/\.int/.long/x;
+ s/\.float/.single/x;
+
+ # catch unknown section names that aren't mach-o style (with a comma)
+ if (/.section ([^,]*)$/) {
+ die ".section $1 unsupported; figure out the mach-o section name and add it";
+ }
+
+ # macros creating macros is not handled (is that valid?)
+ if (/\.macro\s+([\d\w\.]+)\s*(.*)/) {
+ $current_macro = $1;
+
+ # commas in the argument list are optional, so only use whitespace as the separator
+ my $arglist = $2;
+ $arglist =~ s/,/ /g;
+
+ my @args = split(/\s+/, $arglist);
+ foreach my $i (0 .. $#args) {
+ my @argpair = split(/=/, $args[$i]);
+ $macro_args{$current_macro}[$i] = $argpair[0];
+ $argpair[0] =~ s/:vararg$//;
+ $macro_args_default{$current_macro}{$argpair[0]} = $argpair[1];
+ }
+ # ensure %macro_lines has the macro name added as a key
+ $macro_lines{$current_macro} = [];
+ } elsif (/\.endm/) {
+ if (!$current_macro) {
+ die "ERROR: .endm without .macro";
+ }
+ $current_macro = '';
+ } elsif ($current_macro) {
+ push(@{$macro_lines{$current_macro}}, $_);
+ } else {
+ expand_macros($_);
+ }
+}
+
+sub expand_macros { # expand one input line: if it invokes a known .macro, substitute arguments and recurse; otherwise append it to @pass1_lines
+ my $line = $_[0];
+ if ($line =~ /(\S+:|)\s*([\w\d\.]+)\s*(.*)/ && exists $macro_lines{$2}) {
+ push(@pass1_lines, $1); # keep any label that preceded the macro invocation
+ my $macro = $2;
+
+ # commas are optional here too, but are syntactically important because
+ # parameters can be blank
+ my @arglist = split(/,/, $3);
+ my @args;
+ foreach (@arglist) {
+ my @whitespace_split = split(/\s+/, $_);
+ if (!@whitespace_split) {
+ push(@args, '');
+ } else {
+ foreach (@whitespace_split) {
+ if (length($_)) {
+ push(@args, $_);
+ }
+ }
+ }
+ }
+
+ my %replacements; # argument name -> replacement text, seeded with the macro's declared defaults
+ if ($macro_args_default{$macro}){
+ %replacements = %{$macro_args_default{$macro}};
+ }
+
+ # construct hashtable of text to replace
+ foreach my $i (0 .. $#args) {
+ my $argname = $macro_args{$macro}[$i];
+
+ if ($args[$i] =~ m/=/) {
+ # arg=val references the argument name
+ # XXX: I'm not sure what the expected behaviour is if a lot of
+ # these are mixed with unnamed args
+ my @named_arg = split(/=/, $args[$i]);
+ $replacements{$named_arg[0]} = $named_arg[1];
+ } elsif ($i > $#{$macro_args{$macro}}) {
+ # more args given than the macro has named args
+ # XXX: is vararg allowed on arguments before the last?
+ $argname = $macro_args{$macro}[-1];
+ if ($argname =~ s/:vararg$//) {
+ $replacements{$argname} .= ", $args[$i]";
+ } else {
+ die "Too many arguments to macro $macro";
+ }
+ } else {
+ $argname =~ s/:vararg$//;
+ $replacements{$argname} = $args[$i];
+ }
+ }
+
+ # apply replacements as regex
+ foreach (@{$macro_lines{$macro}}) {
+ my $macro_line = $_;
+ # do replacements by longest first, this avoids wrong replacement
+ # when argument names are subsets of each other
+ foreach (reverse sort {length $a <=> length $b} keys %replacements) {
+ $macro_line =~ s/\\$_/$replacements{$_}/g;
+ }
+ $macro_line =~ s/\\\(\)//g; # remove \()
+ expand_macros($macro_line); # macro bodies may themselves invoke macros
+ }
+ } else {
+ push(@pass1_lines, $line);
+ }
+}
+
+close(ASMFILE) or exit 1;
+open(ASMFILE, "|-", $gcc_cmd) or die "Error running assembler";
+
+my @sections;
+my $num_repts;
+my $rept_lines;
+
+my %literal_labels; # for ldr <reg>, =<expr>
+my $literal_num = 0;
+
+# pass 2: parse .rept and .if variants
+# NOTE: since we don't implement a proper parser, using .rept with a
+# variable assigned from .set is not supported
+foreach my $line (@pass1_lines) {
+ # textual comparison .if
+ # this assumes nothing else on the same line
+ if ($line =~ /\.ifnb\s+(.*)/) {
+ if ($1) {
+ $line = ".if 1\n";
+ } else {
+ $line = ".if 0\n";
+ }
+ } elsif ($line =~ /\.ifb\s+(.*)/) {
+ if ($1) {
+ $line = ".if 0\n";
+ } else {
+ $line = ".if 1\n";
+ }
+ } elsif ($line =~ /\.ifc\s+(.*)\s*,\s*(.*)/) {
+ if ($1 eq $2) {
+ $line = ".if 1\n";
+ } else {
+ $line = ".if 0\n";
+ }
+ }
+
+ # handle .previous (only with regard to .section not .subsection)
+ if ($line =~ /\.(section|text|const_data)/) {
+ push(@sections, $line);
+ } elsif ($line =~ /\.previous/) {
+ if (!$sections[-2]) {
+ die ".previous without a previous section";
+ }
+ $line = $sections[-2];
+ push(@sections, $line);
+ }
+
+ # handle ldr <reg>, =<expr>
+ if ($line =~ /(.*)\s*ldr([\w\s\d]+)\s*,\s*=(.*)/) {
+ my $label = $literal_labels{$3};
+ if (!$label) {
+ $label = ".Literal_$literal_num";
+ $literal_num++;
+ $literal_labels{$3} = $label;
+ }
+ $line = "$1 ldr$2, $label\n";
+ } elsif ($line =~ /\.ltorg/) {
+ foreach my $literal (keys %literal_labels) {
+ $line .= "$literal_labels{$literal}:\n .word $literal\n";
+ }
+ %literal_labels = ();
+ }
+
+ # @l -> lo16() @ha -> ha16()
+ $line =~ s/,\s+([^,]+)\@l(\s)/, lo16($1)$2/g;
+ $line =~ s/,\s+([^,]+)\@ha(\s)/, ha16($1)$2/g;
+
+ if ($line =~ /\.rept\s+(.*)/) {
+ $num_repts = $1;
+ $rept_lines = "\n";
+
+ # handle the possibility of repeating another directive on the same line
+ # .endr on the same line is not valid, I don't know if a non-directive is
+ if ($num_repts =~ s/(\.\w+.*)//) {
+ $rept_lines .= "$1\n";
+ }
+ $num_repts = eval($num_repts);
+ } elsif ($line =~ /\.endr/) {
+ for (1 .. $num_repts) {
+ print ASMFILE $rept_lines;
+ }
+ $rept_lines = '';
+ } elsif ($rept_lines) {
+ $rept_lines .= $line;
+ } else {
+ print ASMFILE $line;
+ }
+}
+
+print ASMFILE ".text\n";
+foreach my $literal (keys %literal_labels) {
+ print ASMFILE "$literal_labels{$literal}:\n .word $literal\n";
+}
+
+close(ASMFILE) or exit 1;
diff --git a/input/avs.c b/input/avs.c
index 522f8fe..79b5c80 100644
--- a/input/avs.c
+++ b/input/avs.c
@@ -313,4 +313,4 @@ static int close_file( hnd_t handle )
return 0;
}
-cli_input_t avs_input = { open_file, get_frame_total, picture_alloc, read_frame, release_frame, picture_clean, close_file };
+const cli_input_t avs_input = { open_file, get_frame_total, picture_alloc, read_frame, release_frame, picture_clean, close_file };
diff --git a/input/ffms.c b/input/ffms.c
index b680967..14962c7 100644
--- a/input/ffms.c
+++ b/input/ffms.c
@@ -244,4 +244,4 @@ static int close_file( hnd_t handle )
return 0;
}
-cli_input_t ffms_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
+const cli_input_t ffms_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
diff --git a/input/input.h b/input/input.h
index 9fb425c..6e386f4 100644
--- a/input/input.h
+++ b/input/input.h
@@ -60,11 +60,11 @@ typedef struct
int (*close_file)( hnd_t handle );
} cli_input_t;
-extern cli_input_t yuv_input;
-extern cli_input_t y4m_input;
-extern cli_input_t avs_input;
+extern const cli_input_t yuv_input;
+extern const cli_input_t y4m_input;
+extern const cli_input_t avs_input;
extern cli_input_t thread_input;
-extern cli_input_t lavf_input;
-extern cli_input_t ffms_input;
+extern const cli_input_t lavf_input;
+extern const cli_input_t ffms_input;
#endif
diff --git a/input/lavf.c b/input/lavf.c
index 180e509..6ecc6b0 100644
--- a/input/lavf.c
+++ b/input/lavf.c
@@ -269,4 +269,4 @@ static int close_file( hnd_t handle )
return 0;
}
-cli_input_t lavf_input = { open_file, get_frame_total, picture_alloc, read_frame, NULL, picture_clean, close_file };
+const cli_input_t lavf_input = { open_file, get_frame_total, picture_alloc, read_frame, NULL, picture_clean, close_file };
diff --git a/input/y4m.c b/input/y4m.c
index 1619f74..8645ff7 100644
--- a/input/y4m.c
+++ b/input/y4m.c
@@ -242,4 +242,4 @@ static int close_file( hnd_t handle )
return 0;
}
-cli_input_t y4m_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
+const cli_input_t y4m_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
diff --git a/input/yuv.c b/input/yuv.c
index dbd0317..3e39e07 100644
--- a/input/yuv.c
+++ b/input/yuv.c
@@ -125,4 +125,4 @@ static int close_file( hnd_t handle )
return 0;
}
-cli_input_t yuv_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
+const cli_input_t yuv_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file };
diff --git a/output/flv.c b/output/flv.c
index b3e5d16..2e0a0e4 100644
--- a/output/flv.c
+++ b/output/flv.c
@@ -305,4 +305,4 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
return 0;
}
-cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
+const cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/output/matroska.c b/output/matroska.c
index 8e84f52..fb39ced 100644
--- a/output/matroska.c
+++ b/output/matroska.c
@@ -146,7 +146,7 @@ static int write_headers( hnd_t handle, x264_nal_t *p_nal )
memcpy( avcC+11+sps_size, pps, pps_size );
- ret = mk_writeHeader( p_mkv->w, "x264", "V_MPEG4/ISO/AVC",
+ ret = mk_writeHeader( p_mkv->w, "x264" X264_VERSION, "V_MPEG4/ISO/AVC",
avcC, avcC_len, p_mkv->frame_duration, 50000,
p_mkv->width, p_mkv->height,
p_mkv->d_width, p_mkv->d_height );
@@ -185,7 +185,7 @@ static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_
p_mkv->b_writing_frame = 0;
- if( mk_set_frame_flags( p_mkv->w, i_stamp, p_picture->b_keyframe ) < 0 )
+ if( mk_set_frame_flags( p_mkv->w, i_stamp, p_picture->b_keyframe, p_picture->i_type == X264_TYPE_B ) < 0 )
return -1;
return i_size;
@@ -206,4 +206,4 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
return ret;
}
-cli_output_t mkv_output = { open_file, set_param, write_headers, write_frame, close_file };
+const cli_output_t mkv_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/output/matroska_ebml.c b/output/matroska_ebml.c
index d1c6e13..7265909 100644
--- a/output/matroska_ebml.c
+++ b/output/matroska_ebml.c
@@ -53,9 +53,9 @@ struct mk_writer
int64_t def_duration;
int64_t timescale;
int64_t cluster_tc_scaled;
- int64_t frame_tc, prev_frame_tc_scaled, max_frame_tc;
+ int64_t frame_tc, max_frame_tc;
- char wrote_header, in_frame, keyframe;
+ char wrote_header, in_frame, keyframe, skippable;
};
static mk_context *mk_create_context( mk_writer *w, mk_context *parent, unsigned id )
@@ -258,23 +258,6 @@ static int mk_write_uint( mk_context *c, unsigned id, int64_t ui )
return 0;
}
-static int mk_write_sint( mk_context *c, unsigned id, int64_t si )
-{
- unsigned char c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
- unsigned i = 0;
-
- CHECK( mk_write_id( c, id ) );
- if( si < 0 )
- while( i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80 )
- ++i;
- else
- while( i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80 ) )
- ++i;
- CHECK( mk_write_size( c, 8 - i ) );
- CHECK( mk_append_context_data( c, c_si+i, 8 - i ) );
- return 0;
-}
-
static int mk_write_float_raw( mk_context *c, float f )
{
union
@@ -301,34 +284,6 @@ static int mk_write_float( mk_context *c, unsigned id, float f )
return 0;
}
-static unsigned mk_ebml_size_size( unsigned s )
-{
- if( s < 0x7f )
- return 1;
- if( s < 0x3fff )
- return 2;
- if( s < 0x1fffff )
- return 3;
- if( s < 0x0fffffff )
- return 4;
- return 5;
-}
-
-static unsigned mk_ebml_sint_size( int64_t si )
-{
- unsigned char c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
- unsigned i = 0;
-
- if( si < 0 )
- while( i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80 )
- ++i;
- else
- while( i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80) )
- ++i;
-
- return 8 - i;
-}
-
mk_writer *mk_create_writer( const char *filename )
{
mk_writer *w = malloc( sizeof(*w) );
@@ -446,8 +401,8 @@ static int mk_close_cluster( mk_writer *w )
static int mk_flush_frame( mk_writer *w )
{
- int64_t delta, ref = 0;
- unsigned fsize, bgsize;
+ int64_t delta;
+ unsigned fsize;
unsigned char c_delta_flags[3];
if( !w->in_frame )
@@ -470,33 +425,22 @@ static int mk_flush_frame( mk_writer *w )
}
fsize = w->frame ? w->frame->d_cur : 0;
- bgsize = fsize + 4 + mk_ebml_size_size( fsize + 4 ) + 1;
- if( !w->keyframe )
- {
- ref = w->prev_frame_tc_scaled - w->cluster_tc_scaled - delta;
- bgsize += 1 + 1 + mk_ebml_sint_size( ref );
- }
- CHECK( mk_write_id( w->cluster, 0xa0 ) ); // BlockGroup
- CHECK( mk_write_size( w->cluster, bgsize ) );
- CHECK( mk_write_id( w->cluster, 0xa1 ) ); // Block
+ CHECK( mk_write_id( w->cluster, 0xa3 ) ); // SimpleBlock
CHECK( mk_write_size( w->cluster, fsize + 4 ) );
CHECK( mk_write_size( w->cluster, 1 ) ); // track number
c_delta_flags[0] = delta >> 8;
c_delta_flags[1] = delta;
- c_delta_flags[2] = 0;
+ c_delta_flags[2] = (w->keyframe << 7) | w->skippable;
CHECK( mk_append_context_data( w->cluster, c_delta_flags, 3 ) );
if( w->frame )
{
CHECK( mk_append_context_data( w->cluster, w->frame->data, w->frame->d_cur ) );
w->frame->d_cur = 0;
}
- if( !w->keyframe )
- CHECK( mk_write_sint( w->cluster, 0xfb, ref ) ); // ReferenceBlock
w->in_frame = 0;
- w->prev_frame_tc_scaled = w->cluster_tc_scaled + delta;
if( w->cluster->d_cur > CLSIZE )
CHECK( mk_close_cluster( w ) );
@@ -509,19 +453,21 @@ int mk_start_frame( mk_writer *w )
if( mk_flush_frame( w ) < 0 )
return -1;
- w->in_frame = 1;
- w->keyframe = 0;
+ w->in_frame = 1;
+ w->keyframe = 0;
+ w->skippable = 0;
return 0;
}
-int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe )
+int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable )
{
if( !w->in_frame )
return -1;
- w->frame_tc = timestamp;
- w->keyframe = keyframe != 0;
+ w->frame_tc = timestamp;
+ w->keyframe = keyframe != 0;
+ w->skippable = skippable != 0;
if( w->max_frame_tc < timestamp )
w->max_frame_tc = timestamp;
diff --git a/output/matroska_ebml.h b/output/matroska_ebml.h
index 252e781..56eb8cc 100644
--- a/output/matroska_ebml.h
+++ b/output/matroska_ebml.h
@@ -35,7 +35,7 @@ int mk_writeHeader( mk_writer *w, const char *writing_app,
int mk_start_frame( mk_writer *w );
int mk_add_frame_data( mk_writer *w, const void *data, unsigned size );
-int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe );
+int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable );
int mk_close( mk_writer *w, int64_t last_delta );
#endif
diff --git a/output/mp4.c b/output/mp4.c
index e3ad9c6..b99eaed 100644
--- a/output/mp4.c
+++ b/output/mp4.c
@@ -121,7 +121,7 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
if( mdhd_duration != total_duration )
{
uint64_t last_dts = gf_isom_get_sample_dts( p_mp4->p_file, p_mp4->i_track, p_mp4->i_numframe );
- uint32_t last_duration = (uint32_t)( mdhd_duration > last_dts ? mdhd_duration - last_dts : (largest_pts - second_largest_pts) * p_mp4->i_time_inc );
+ uint32_t last_duration = (uint32_t)( mdhd_duration > last_dts ? mdhd_duration - last_dts : (largest_pts - second_largest_pts) * p_mp4->i_time_inc );
gf_isom_set_last_sample_duration( p_mp4->p_file, p_mp4->i_track, last_duration );
total_duration = gf_isom_get_media_duration( p_mp4->p_file, p_mp4->i_track );
}
@@ -212,6 +212,7 @@ static int set_param( hnd_t handle, x264_param_t *p_param )
dw *= sar ;
else
dh /= sar;
+ gf_isom_set_pixel_aspect_ratio( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_param->vui.i_sar_width, p_param->vui.i_sar_height );
gf_isom_set_track_layout_info( p_mp4->p_file, p_mp4->i_track, dw, dh, 0, 0, 0 );
}
@@ -297,4 +298,4 @@ static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_
return i_size;
}
-cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file };
+const cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/output/output.h b/output/output.h
index 851b819..c79b48e 100644
--- a/output/output.h
+++ b/output/output.h
@@ -33,9 +33,9 @@ typedef struct
int (*close_file)( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts );
} cli_output_t;
-extern cli_output_t raw_output;
-extern cli_output_t mkv_output;
-extern cli_output_t mp4_output;
-extern cli_output_t flv_output;
+extern const cli_output_t raw_output;
+extern const cli_output_t mkv_output;
+extern const cli_output_t mp4_output;
+extern const cli_output_t flv_output;
#endif
diff --git a/output/raw.c b/output/raw.c
index a4d1175..02e4c56 100644
--- a/output/raw.c
+++ b/output/raw.c
@@ -62,5 +62,5 @@ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest
return fclose( (FILE*)handle );
}
-cli_output_t raw_output = { open_file, set_param, write_headers, write_frame, close_file };
+const cli_output_t raw_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm
index 966615b..1970cb9 100644
--- a/tools/checkasm-a.asm
+++ b/tools/checkasm-a.asm
@@ -71,19 +71,19 @@ cglobal x264_checkasm_call, 4,7,16
%endrep
%assign i 6
%rep 16-6
- movdqa xmm %+ i, [x %+ i GLOBAL]
+ movdqa xmm %+ i, [x %+ i]
%assign i i+1
%endrep
- mov r4, [n4 GLOBAL]
- mov r5, [n5 GLOBAL]
+ mov r4, [n4]
+ mov r5, [n5]
call r6
- xor r4, [n4 GLOBAL]
- xor r5, [n5 GLOBAL]
+ xor r4, [n4]
+ xor r5, [n5]
or r4, r5
pxor xmm5, xmm5
%assign i 6
%rep 16-6
- pxor xmm %+ i, [x %+ i GLOBAL]
+ pxor xmm %+ i, [x %+ i]
por xmm5, xmm %+ i
%assign i i+1
%endrep
@@ -92,7 +92,7 @@ cglobal x264_checkasm_call, 4,7,16
or r4, r5
jz .ok
mov r4, rax
- lea r0, [error_message GLOBAL]
+ lea r0, [error_message]
call puts
mov r1, [rsp+stack_offset+16]
mov dword [r1], 0
@@ -132,7 +132,7 @@ cglobal x264_checkasm_call, 1,7
or r3, r5
jz .ok
mov r3, eax
- lea r1, [error_message GLOBAL]
+ lea r1, [error_message]
push r1
call puts
add esp, 4
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 0bedc5b..595bd9e 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1662,13 +1662,13 @@ static int check_all_flags( void )
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
}
-#elif ARCH_PPC
+#elif defined(ARCH_PPC)
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{
fprintf( stderr, "x264: ALTIVEC against C\n" );
ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
}
-#elif ARCH_ARM
+#elif defined(ARCH_ARM)
if( x264_cpu_detect() & X264_CPU_ARMV6 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
if( x264_cpu_detect() & X264_CPU_NEON )
diff --git a/x264.c b/x264.c
index 58bc1f4..959626a 100644
--- a/x264.c
+++ b/x264.c
@@ -250,23 +250,23 @@ static void Help( x264_param_t *defaults, int longhelp )
" - faster:\n"
" --no-mbtree --no-mixed-refs --ref 2\n"
" --subme 4 --weightp 1\n"
- " - fast\n"
+ " - fast:\n"
" --rc-lookahead 30 --ref 2 --subme 6\n"
- " - medium\n"
+ " - medium:\n"
" Default settings apply.\n"
- " - slow\n"
+ " - slow:\n"
" --b-adapt 2 --direct auto --me umh\n"
" --rc-lookahead 50 --ref 5 --subme 8\n"
- " - slower\n"
+ " - slower:\n"
" --b-adapt 2 --direct auto --me umh\n"
" --partitions all --rc-lookahead 60\n"
" --ref 8 --subme 9 --trellis 2\n"
- " - veryslow\n"
+ " - veryslow:\n"
" --b-adapt 2 --bframes 8 --direct auto\n"
" --me umh --merange 24 --partitions all\n"
" --ref 16 --subme 10 --trellis 2\n"
" --rc-lookahead 60\n"
- " - placebo\n"
+ " - placebo:\n"
" --bframes 16 --b-adapt 2 --direct auto\n"
" --slow-firstpass --no-fast-pskip\n"
" --me tesa --merange 24 --partitions all\n"
@@ -281,31 +281,26 @@ static void Help( x264_param_t *defaults, int longhelp )
" Only one psy tuning can be used at a time.\n" );
H2( " - film (psy tuning):\n"
" --deblock -1:-1 --psy-rd <unset>:0.15\n"
- " - animation(psy tuning):\n"
+ " - animation (psy tuning):\n"
" --bframes {+2} --deblock 1:1\n"
" --psy-rd 0.4:<unset> --aq-strength 0.6\n"
" --ref {Double if >1 else 1}\n"
- " - grain(psy tuning):\n"
+ " - grain (psy tuning):\n"
" --aq-strength 0.5 --no-dct-decimate\n"
" --deadzone-inter 6 --deadzone-intra 6\n"
" --deblock -2:-2 --ipratio 1.1 \n"
" --pbratio 1.1 --psy-rd <unset>:0.25\n"
" --qcomp 0.8\n"
- " - psnr(psy tuning):\n"
+ " - psnr (psy tuning):\n"
" --aq-mode 0 --no-psy\n"
- " - ssim(psy tuning):\n"
+ " - ssim (psy tuning):\n"
" --aq-mode 2 --no-psy\n"
" - fastdecode:\n"
" --no-cabac --no-deblock --no-weightb\n"
" --weightp 0\n"
" - zerolatency:\n"
" --bframes 0 --rc-lookahead 0\n"
- " --sync-lookahead 0 --sliced-threads\n"
- " - touhou(psy tuning):\n"
- " --aq-strength 1.3 --deblock -1:-1\n"
- " --partitions {p4x4 if p8x8 set}\n"
- " --psy-rd <unset>:0.2\n"
- " --ref {Double if >1 else 1}\n" );
+ " --sync-lookahead 0 --sliced-threads\n" );
else H0( " - psy tunings: film,animation,grain,psnr,ssim\n"
" - other tunings: fastdecode,zerolatency\n" );
H1( " --slow-firstpass Don't use faster settings with --pass 1\n" );
diff --git a/x264.h b/x264.h
index 2550864..e7d19b7 100644
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@
#include <stdarg.h>
-#define X264_BUILD 84
+#define X264_BUILD 85
/* x264_t:
* opaque handler for encoder */
@@ -480,11 +480,12 @@ typedef struct
x264_t *x264_encoder_open( x264_param_t * );
/* x264_encoder_reconfig:
- * analysis-related parameters from x264_param_t are copied.
+ * various parameters from x264_param_t are copied.
* this takes effect immediately, on whichever frame is encoded next;
* due to delay, this may not be the next frame passed to encoder_encode.
* if the change should apply to some particular frame, use x264_picture_t->param instead.
- * returns 0 on success, negative on parameter validation error. */
+ * returns 0 on success, negative on parameter validation error.
+ * not all parameters can be changed; see the actual function for a detailed breakdown. */
int x264_encoder_reconfig( x264_t *, x264_param_t * );
/* x264_encoder_parameters:
* copies the current internal set of parameters to the pointer provided
--
x264 packaging
More information about the pkg-multimedia-commits
mailing list