[SCM] x264/upstream: Imported Upstream version 0.119.2113+gitcc129ad
siretart at users.alioth.debian.org
Sat Dec 31 10:49:37 UTC 2011
The following commit has been merged in the upstream branch:
commit 20038120c998edb4e8b9f63028804556ca5af5d5
Author: Reinhard Tartler <siretart at tauware.de>
Date: Sat Dec 31 09:05:59 2011 +0100
Imported Upstream version 0.119.2113+gitcc129ad
diff --git a/Makefile b/Makefile
index 560ba90..c9505d3 100644
--- a/Makefile
+++ b/Makefile
@@ -22,6 +22,8 @@ SRCCLI = x264.c input/input.c input/timecode.c input/raw.c input/y4m.c \
SRCSO =
+OBJCHK = tools/checkasm.o
+
CONFIG := $(shell cat config.h)
# GPL-only files
@@ -88,7 +90,7 @@ ASFLAGS += -Icommon/x86/
SRCS += common/x86/mc-c.c common/x86/predict-c.c
OBJASM = $(ASMSRC:%.asm=%.o)
$(OBJASM): common/x86/x86inc.asm common/x86/x86util.asm
-checkasm: tools/checkasm-a.o
+OBJCHK += tools/checkasm-a.o
endif
endif
@@ -135,7 +137,7 @@ OBJCLI = $(SRCCLI:%.c=%.o)
OBJSO = $(SRCSO:%.c=%.o)
DEP = depend
-.PHONY: all default fprofiled clean distclean install uninstall dox test testclean lib-static lib-shared cli install-lib-dev install-lib-static install-lib-shared install-cli
+.PHONY: all default fprofiled clean distclean install uninstall lib-static lib-shared cli install-lib-dev install-lib-static install-lib-shared install-cli
default: $(DEP)
@@ -144,17 +146,26 @@ lib-static: $(LIBX264)
lib-shared: $(SONAME)
$(LIBX264): .depend $(OBJS) $(OBJASM)
+ rm -f $(LIBX264)
$(AR)$@ $(OBJS) $(OBJASM)
$(if $(RANLIB), $(RANLIB) $@)
$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
+ifneq ($(EXE),)
+.PHONY: x264 checkasm
+x264: x264$(EXE)
+checkasm: checkasm$(EXE)
+endif
+
x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
-checkasm: tools/checkasm.o $(LIBX264)
- $(LD)$@ $+ $(LDFLAGS)
+checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
+ $(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
+
+$(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
%.o: %.asm
$(AS) $(ASFLAGS) -o $@ $<
@@ -204,12 +215,11 @@ endif
clean:
rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
- rm -f checkasm checkasm.exe tools/checkasm.o tools/checkasm-a.o
+ rm -f checkasm checkasm.exe $(OBJCHK)
rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
distclean: clean
rm -f config.mak x264_config.h config.h config.log x264.pc x264.def
- rm -rf test/
install-cli: cli
install -d $(DESTDIR)$(bindir)
diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
index f5b90a5..7b2ac9f 100644
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -30,7 +30,7 @@
.macro h264_loop_filter_start
ldr ip, [sp]
ldr ip, [ip]
- vmov.32 d24[0], ip
+ vdup.32 d24, ip
and ip, ip, ip, lsl #16
ands ip, ip, ip, lsl #8
bxlt lr
@@ -197,52 +197,62 @@ function x264_deblock_h_luma_neon
.endfunc
.macro h264_loop_filter_chroma
- vdup.8 d22, r2 // alpha
+ vdup.8 q11, r2 // alpha
vmovl.u8 q12, d24
- vabd.u8 d26, d16, d0 // abs(p0 - q0)
- vmovl.u8 q2, d0
- vabd.u8 d28, d18, d16 // abs(p1 - p0)
- vsubw.u8 q2, q2, d16
- vsli.16 d24, d24, #8
+ vabd.u8 q13, q8, q0 // abs(p0 - q0)
+ vabd.u8 q14, q9, q8 // abs(p1 - p0)
+ vsubl.u8 q2, d0, d16
+ vsubl.u8 q3, d1, d17
+ vsli.16 q12, q12, #8
vshl.i16 q2, q2, #2
- vabd.u8 d30, d2, d0 // abs(q1 - q0)
+ vshl.i16 q3, q3, #2
+ vabd.u8 q15, q1, q0 // abs(q1 - q0)
vaddw.u8 q2, q2, d18
- vclt.u8 d26, d26, d22 // < alpha
+ vaddw.u8 q3, q3, d19
+ vclt.u8 q13, q13, q11 // < alpha
vsubw.u8 q2, q2, d2
- vdup.8 d22, r3 // beta
- vclt.s8 d25, d24, #0
+ vsubw.u8 q3, q3, d3
+ vdup.8 q11, r3 // beta
+ vclt.s8 q10, q12, #0
vrshrn.i16 d4, q2, #3
- vclt.u8 d28, d28, d22 // < beta
- vbic d26, d26, d25
- vclt.u8 d30, d30, d22 // < beta
- vand d26, d26, d28
- vneg.s8 d25, d24
- vand d26, d26, d30
- vmin.s8 d4, d4, d24
+ vrshrn.i16 d5, q3, #3
+ vclt.u8 q14, q14, q11 // < beta
+ vbic q13, q13, q10
+ vclt.u8 q15, q15, q11 // < beta
+ vand q13, q13, q14
+ vneg.s8 q10, q12
+ vand q13, q13, q15
+ vmin.s8 q2, q2, q12
vmovl.u8 q14, d16
- vand d4, d4, d26
- vmax.s8 d4, d4, d25
+ vand q2, q2, q13
+ vmovl.u8 q15, d17
+ vmax.s8 q2, q2, q10
vmovl.u8 q11, d0
+ vmovl.u8 q12, d1
vaddw.s8 q14, q14, d4
+ vaddw.s8 q15, q15, d5
vsubw.s8 q11, q11, d4
+ vsubw.s8 q12, q12, d5
vqmovun.s16 d16, q14
+ vqmovun.s16 d17, q15
vqmovun.s16 d0, q11
+ vqmovun.s16 d1, q12
.endm
function x264_deblock_v_chroma_neon
h264_loop_filter_start
sub r0, r0, r1, lsl #1
- vld1.64 {d18}, [r0,:64], r1
- vld1.64 {d16}, [r0,:64], r1
- vld1.64 {d0}, [r0,:64], r1
- vld1.64 {d2}, [r0,:64]
+ vld2.8 {d18,d19}, [r0,:128], r1
+ vld2.8 {d16,d17}, [r0,:128], r1
+ vld2.8 {d0, d1}, [r0,:128], r1
+ vld2.8 {d2, d3}, [r0,:128]
h264_loop_filter_chroma
sub r0, r0, r1, lsl #1
- vst1.64 {d16}, [r0,:64], r1
- vst1.64 {d0}, [r0,:64], r1
+ vst2.8 {d16,d17}, [r0,:128], r1
+ vst2.8 {d0, d1}, [r0,:128], r1
bx lr
.endfunc
@@ -250,37 +260,47 @@ function x264_deblock_v_chroma_neon
function x264_deblock_h_chroma_neon
h264_loop_filter_start
- sub r0, r0, #2
- vld1.32 {d18[]}, [r0], r1
- vld1.32 {d16[]}, [r0], r1
- vld1.32 {d0[]}, [r0], r1
- vld1.32 {d2[]}, [r0], r1
- vld1.32 {d18[1]}, [r0], r1
- vld1.32 {d16[1]}, [r0], r1
- vld1.32 {d0[1]}, [r0], r1
- vld1.32 {d2[1]}, [r0], r1
-
- vtrn.16 d18, d0
- vtrn.16 d16, d2
- vtrn.8 d18, d16
- vtrn.8 d0, d2
+ sub r0, r0, #4
+ vld1.8 {d18}, [r0], r1
+ vld1.8 {d16}, [r0], r1
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d19}, [r0], r1
+ vld1.8 {d17}, [r0], r1
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d3}, [r0], r1
+
+ vuzp.8 d18, d19
+ vuzp.8 d16, d17
+ vuzp.8 d0, d1
+ vuzp.8 d2, d3
+
+ vtrn.16 q9, q0
+ vtrn.16 q8, q1
+ vtrn.8 q9, q8
+ vtrn.8 q0, q1
h264_loop_filter_chroma
- vtrn.16 d18, d0
- vtrn.16 d16, d2
- vtrn.8 d18, d16
- vtrn.8 d0, d2
+ vtrn.16 q9, q0
+ vtrn.16 q8, q1
+ vtrn.8 q9, q8
+ vtrn.8 q0, q1
+
+ vzip.8 d18, d19
+ vzip.8 d16, d17
+ vzip.8 d0, d1
+ vzip.8 d2, d3
sub r0, r0, r1, lsl #3
- vst1.32 {d18[0]}, [r0], r1
- vst1.32 {d16[0]}, [r0], r1
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d2[0]}, [r0], r1
- vst1.32 {d18[1]}, [r0], r1
- vst1.32 {d16[1]}, [r0], r1
- vst1.32 {d0[1]}, [r0], r1
- vst1.32 {d2[1]}, [r0], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d16}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d2}, [r0], r1
+ vst1.8 {d19}, [r0], r1
+ vst1.8 {d17}, [r0], r1
+ vst1.8 {d1}, [r0], r1
+ vst1.8 {d3}, [r0], r1
bx lr
.endfunc
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index c437dd3..c1fc05c 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -210,7 +210,8 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
return;
#if !HIGH_BIT_DEPTH
- pf->prefetch_fenc = x264_prefetch_fenc_arm;
+ pf->prefetch_fenc_420 = x264_prefetch_fenc_arm;
+ pf->prefetch_fenc_422 = x264_prefetch_fenc_arm; /* FIXME */
pf->prefetch_ref = x264_prefetch_ref_arm;
#endif // !HIGH_BIT_DEPTH
diff --git a/common/common.c b/common/common.c
index 4c978d3..5c9a72d 100644
--- a/common/common.c
+++ b/common/common.c
@@ -35,6 +35,8 @@
const int x264_bit_depth = BIT_DEPTH;
+const int x264_chroma_format = X264_CHROMA_FORMAT;
+
static void x264_log_default( void *, int, const char *, va_list );
/****************************************************************************
@@ -52,7 +54,7 @@ void x264_param_default( x264_param_t *param )
param->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
/* Video properties */
- param->i_csp = X264_CSP_I420;
+ param->i_csp = X264_CHROMA_FORMAT ? X264_CHROMA_FORMAT : X264_CSP_I420;
param->i_width = 0;
param->i_height = 0;
param->vui.i_sar_width = 0;
diff --git a/common/common.h b/common/common.h
index d1f830f..5763c2e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -40,9 +40,6 @@
#define IS_DISPOSABLE(type) ( type == X264_TYPE_B )
#define FIX8(f) ((int)(f*(1<<8)+.5))
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
-#define CHROMA_FORMAT h->sps->i_chroma_format_idc
-#define CHROMA_SIZE(s) ((s)>>(h->mb.chroma_h_shift+h->mb.chroma_v_shift))
-#define FRAME_SIZE(s) ((s)+2*CHROMA_SIZE(s))
#define CHECKED_MALLOC( var, size )\
do {\
@@ -105,6 +102,17 @@ do {\
# define PARAM_INTERLACED 0
#endif
+#ifdef CHROMA_FORMAT
+# define CHROMA_H_SHIFT (CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422)
+# define CHROMA_V_SHIFT (CHROMA_FORMAT == CHROMA_420)
+#else
+# define CHROMA_FORMAT h->sps->i_chroma_format_idc
+# define CHROMA_H_SHIFT h->mb.chroma_h_shift
+# define CHROMA_V_SHIFT h->mb.chroma_v_shift
+#endif
+
+#define CHROMA_SIZE(s) ((s)>>(CHROMA_H_SHIFT+CHROMA_V_SHIFT))
+#define FRAME_SIZE(s) ((s)+2*CHROMA_SIZE(s))
#define CHROMA444 (CHROMA_FORMAT == CHROMA_444)
/* Unions for type-punning.
diff --git a/common/cpu.c b/common/cpu.c
index f4d7dce..8164045 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -63,6 +63,8 @@ const x264_cpu_name_t x264_cpu_names[] =
{"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
{"AVX", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX},
+ {"XOP", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_XOP},
+ {"FMA4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_FMA4},
#undef SSE2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
@@ -175,6 +177,14 @@ uint32_t x264_cpu_detect( void )
cpu |= X264_CPU_SSE_MISALIGN;
x264_cpu_mask_misalign_sse();
}
+
+ if( cpu & X264_CPU_AVX )
+ {
+ if( ecx&0x00000800 ) /* XOP */
+ cpu |= X264_CPU_XOP;
+ if( ecx&0x00010000 ) /* FMA4 */
+ cpu |= X264_CPU_FMA4;
+ }
}
}
diff --git a/common/dct.c b/common/dct.c
index cf8a235..e62ec06 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -887,6 +887,8 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
}
+ if( cpu&X264_CPU_XOP )
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
#endif // HAVE_MMX
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
diff --git a/common/deblock.c b/common/deblock.c
index c38a9d0..2ae5f75 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -394,7 +394,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
int stridey = h->fdec->i_stride[0];
int strideuv = h->fdec->i_stride[1];
int chroma444 = CHROMA444;
- int chroma_height = 16 >> h->mb.chroma_v_shift;
+ int chroma_height = 16 >> CHROMA_V_SHIFT;
intptr_t uvdiff = chroma444 ? h->fdec->plane[2] - h->fdec->plane[1] : 1;
for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
@@ -484,7 +484,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
}
int offy = MB_INTERLACED ? 4 : 0;
- int offuv = MB_INTERLACED ? 4-h->mb.chroma_v_shift : 0;
+ int offuv = MB_INTERLACED ? 4-CHROMA_V_SHIFT : 0;
left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]];
luma_qp[1] = (qp + left_qp[1] + 1) >> 1;
chroma_qp[1] = (qpc + h->chroma_qp_table[left_qp[1]] + 1) >> 1;
@@ -647,6 +647,9 @@ void x264_deblock_v_chroma_sse2( pixel *pix, int stride, int alpha, int beta, in
void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_422_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_422_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_422_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
@@ -736,6 +739,9 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2;
+#if !HIGH_BIT_DEPTH
+ pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2;
+#endif
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
@@ -745,12 +751,15 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
if( cpu&X264_CPU_SSE2 )
{
pf->deblock_strength = x264_deblock_strength_sse2;
+ pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
+#if !HIGH_BIT_DEPTH
+ pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2;
+#endif
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2;
- pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
@@ -762,12 +771,15 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
if( cpu&X264_CPU_AVX )
{
pf->deblock_strength = x264_deblock_strength_avx;
+ pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
+#if !HIGH_BIT_DEPTH
+ pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx;
+#endif
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_luma[1] = x264_deblock_v_luma_avx;
pf->deblock_luma[0] = x264_deblock_h_luma_avx;
pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
- pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
@@ -791,8 +803,8 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
{
pf->deblock_luma[1] = x264_deblock_v_luma_neon;
pf->deblock_luma[0] = x264_deblock_h_luma_neon;
-// pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
-// pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
+ pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
+ pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
}
#endif
#endif // !HIGH_BIT_DEPTH
diff --git a/common/frame.c b/common/frame.c
index 254b962..04e8afa 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -373,7 +373,7 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
}
else
{
- int v_shift = h->mb.chroma_v_shift;
+ int v_shift = CHROMA_V_SHIFT;
get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 );
h->mc.plane_copy( dst->plane[0], dst->i_stride[0], (pixel*)pix[0],
stride[0]/sizeof(pixel), h->param.i_width, h->param.i_height );
@@ -486,8 +486,8 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e
return;
for( int i = 0; i < frame->i_plane; i++ )
{
- int h_shift = i && h->mb.chroma_h_shift;
- int v_shift = i && h->mb.chroma_v_shift;
+ int h_shift = i && CHROMA_H_SHIFT;
+ int v_shift = i && CHROMA_V_SHIFT;
int stride = frame->i_stride[i];
int width = 16*h->mb.i_mb_width;
int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> v_shift;
@@ -554,9 +554,9 @@ void x264_frame_expand_border_lowres( x264_frame_t *frame )
void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane )
{
- int v_shift = h->mb.chroma_v_shift;
+ int v_shift = CHROMA_V_SHIFT;
plane_expand_border( frame->plane[plane], frame->i_stride[plane], 16*h->mb.i_mb_width, 16*h->mb.i_mb_height>>v_shift,
- PADH, PADV>>v_shift, 1, 1, h->mb.chroma_h_shift );
+ PADH, PADV>>v_shift, 1, 1, CHROMA_H_SHIFT );
}
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
@@ -564,8 +564,8 @@ void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
for( int i = 0; i < frame->i_plane; i++ )
{
int i_width = h->param.i_width;
- int h_shift = i && h->mb.chroma_h_shift;
- int v_shift = i && h->mb.chroma_v_shift;
+ int h_shift = i && CHROMA_H_SHIFT;
+ int v_shift = i && CHROMA_V_SHIFT;
int i_height = h->param.i_height >> v_shift;
int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width);
int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
@@ -591,7 +591,7 @@ void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y )
{
for( int i = 0; i < h->fenc->i_plane; i++ )
{
- int v_shift = i && h->mb.chroma_v_shift;
+ int v_shift = i && CHROMA_V_SHIFT;
int stride = h->fenc->i_stride[i];
int height = h->param.i_height >> v_shift;
int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
diff --git a/common/macroblock.c b/common/macroblock.c
index fd1b90b..a183cf1 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -51,7 +51,7 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h
}
else
{
- int v_shift = h->mb.chroma_v_shift;
+ int v_shift = CHROMA_V_SHIFT;
// Chroma in 4:2:0 is offset if MCing from a field of opposite parity
if( v_shift & MB_INTERLACED & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
@@ -90,7 +90,7 @@ static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int h
}
else
{
- int v_shift = h->mb.chroma_v_shift;
+ int v_shift = CHROMA_V_SHIFT;
if( v_shift & MB_INTERLACED & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
@@ -135,7 +135,7 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
}
else
{
- int v_shift = h->mb.chroma_v_shift;
+ int v_shift = CHROMA_V_SHIFT;
if( v_shift & MB_INTERLACED & i_ref0 )
mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
if( v_shift & MB_INTERLACED & i_ref1 )
@@ -541,7 +541,7 @@ void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
int stride_y = fenc->i_stride[0];
int stride_uv = fenc->i_stride[1];
int off_y = 16 * i_mb_x + 16 * i_mb_y * stride_y;
- int off_uv = 16 * i_mb_x + (16 * i_mb_y * stride_uv >> h->mb.chroma_v_shift);
+ int off_uv = 16 * i_mb_x + (16 * i_mb_y * stride_uv >> CHROMA_V_SHIFT);
h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y,
fenc->plane[1]+off_uv, stride_uv, i_mb_x );
}
@@ -556,7 +556,7 @@ NOINLINE void x264_copy_column8( pixel *dst, pixel *src )
static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )
{
int mb_interlaced = b_mbaff && MB_INTERLACED;
- int height = b_chroma ? 16 >> h->mb.chroma_v_shift : 16;
+ int height = b_chroma ? 16 >> CHROMA_V_SHIFT : 16;
int i_stride = h->fdec->i_stride[i];
int i_stride2 = i_stride << mb_interlaced;
int i_pix_offset = mb_interlaced
@@ -873,8 +873,8 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
/* load non_zero_count */
CP32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8], &nnz[top][12] );
- CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16-4 + (16>>h->mb.chroma_v_shift)] );
- CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32-4 + (16>>h->mb.chroma_v_shift)] );
+ CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16-4 + (16>>CHROMA_V_SHIFT)] );
+ CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32-4 + (16>>CHROMA_V_SHIFT)] );
/* Finish the prefetching */
for( int l = 0; l < lists; l++ )
@@ -927,7 +927,7 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
if( CHROMA_FORMAT >= CHROMA_422 )
{
- int offset = (4>>h->mb.chroma_h_shift) - 4;
+ int offset = (4>>CHROMA_H_SHIFT) - 4;
h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16+offset];
h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16+offset];
h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16+offset];
@@ -1449,7 +1449,7 @@ void x264_macroblock_deblock_strength( x264_t *h )
/* Early termination: in this case, nnz guarantees all edges use strength 2.*/
if( h->mb.b_transform_8x8 && !CHROMA444 )
{
- int cbp_mask = 0xf >> h->mb.chroma_v_shift;
+ int cbp_mask = 0xf >> CHROMA_V_SHIFT;
if( (h->mb.i_cbp_luma&cbp_mask) == cbp_mask )
{
M32( bs[0][0] ) = 0x02020202;
@@ -1620,7 +1620,7 @@ void x264_macroblock_deblock_strength( x264_t *h )
static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )
{
- int height = b_chroma ? 16>>h->mb.chroma_v_shift : 16;
+ int height = b_chroma ? 16>>CHROMA_V_SHIFT : 16;
int i_stride = h->fdec->i_stride[i];
int i_stride2 = i_stride << (b_mbaff && MB_INTERLACED);
int i_pix_offset = (b_mbaff && MB_INTERLACED)
@@ -1647,7 +1647,7 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int
}
else
{
- int backup_src = (15>>h->mb.chroma_v_shift) * FDEC_STRIDE;
+ int backup_src = (15>>CHROMA_V_SHIFT) * FDEC_STRIDE;
memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*sizeof(pixel) );
memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*sizeof(pixel) );
}
@@ -1677,8 +1677,8 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int
/* In progressive we update intra_border_backup in-place, so the topleft neighbor will
* no longer exist there when load_pic_pointers wants it. Move it within p_fdec instead. */
h->mb.pic.p_fdec[0][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[0][-FDEC_STRIDE+15];
- h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+(15>>h->mb.chroma_h_shift)];
- h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+(15>>h->mb.chroma_h_shift)];
+ h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+(15>>CHROMA_H_SHIFT)];
+ h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+(15>>CHROMA_H_SHIFT)];
}
}
diff --git a/common/mc.c b/common/mc.c
index c2b77f5..314c6f9 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -304,9 +304,9 @@ void x264_plane_copy_interleave_c( pixel *dst, int i_dst,
}
}
-void x264_plane_copy_deinterleave_c( pixel *dstu, int i_dstu,
- pixel *dstv, int i_dstv,
- pixel *src, int i_src, int w, int h )
+static void x264_plane_copy_deinterleave_c( pixel *dstu, int i_dstu,
+ pixel *dstv, int i_dstv,
+ pixel *src, int i_src, int w, int h )
{
for( int y=0; y<h; y++, dstu+=i_dstu, dstv+=i_dstv, src+=i_src )
for( int x=0; x<w; x++ )
@@ -316,10 +316,10 @@ void x264_plane_copy_deinterleave_c( pixel *dstu, int i_dstu,
}
}
-void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, int i_dsta,
- pixel *dstb, int i_dstb,
- pixel *dstc, int i_dstc,
- pixel *src, int i_src, int pw, int w, int h )
+static void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, int i_dsta,
+ pixel *dstb, int i_dstb,
+ pixel *dstc, int i_dstc,
+ pixel *src, int i_src, int pw, int w, int h )
{
for( int y=0; y<h; y++, dsta+=i_dsta, dstb+=i_dstb, dstc+=i_dstc, src+=i_src )
{
@@ -506,7 +506,8 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf->hpel_filter = hpel_filter;
- pf->prefetch_fenc = prefetch_fenc_null;
+ pf->prefetch_fenc_420 = prefetch_fenc_null;
+ pf->prefetch_fenc_422 = prefetch_fenc_null;
pf->prefetch_ref = prefetch_ref_null;
pf->memcpy_aligned = memcpy;
pf->memzero_aligned = memzero_aligned;
diff --git a/common/mc.h b/common/mc.h
index 09dda55..40fb591 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -103,6 +103,10 @@ typedef struct
/* prefetch the next few macroblocks of fenc or fdec */
void (*prefetch_fenc)( pixel *pix_y, int stride_y,
pixel *pix_uv, int stride_uv, int mb_x );
+ void (*prefetch_fenc_420)( pixel *pix_y, int stride_y,
+ pixel *pix_uv, int stride_uv, int mb_x );
+ void (*prefetch_fenc_422)( pixel *pix_y, int stride_y,
+ pixel *pix_uv, int stride_uv, int mb_x );
/* prefetch the next few macroblocks of a hpel reference frame */
void (*prefetch_ref)( pixel *pix, int stride, int parity );
diff --git a/common/pixel.c b/common/pixel.c
index 19d6d2f..f65e2d7 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -496,6 +496,7 @@ SATD_X_DECL6( _sse2 )
SATD_X_DECL7( _ssse3 )
SATD_X_DECL7( _sse4 )
SATD_X_DECL7( _avx )
+SATD_X_DECL7( _xop )
#endif // !HIGH_BIT_DEPTH
#endif
@@ -546,7 +547,8 @@ INTRA_MBCMP(satd, 8x16, dc, h, v, c,, _c )
INTRA_MBCMP( sad, 16x16, v, h, dc, ,, _c )
INTRA_MBCMP(satd, 16x16, v, h, dc, ,, _c )
-#if HIGH_BIT_DEPTH && HAVE_MMX
+#if HAVE_MMX
+#if HIGH_BIT_DEPTH
INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c )
INTRA_MBCMP(satd, 4x4, v, h, dc, , _mmx2, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _c )
@@ -558,6 +560,17 @@ INTRA_MBCMP( sad, 16x16, v, h, dc, , _sse2, _sse2 )
INTRA_MBCMP( sad, 4x4, v, h, dc, , _ssse3, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _ssse3, _sse2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _ssse3, _sse2 )
+#else
+#define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_mmx
+INTRA_MBCMP( sad, 8x16, dc, h, v, c, _mmx2, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _mmx2, _mmx2 )
+INTRA_MBCMP( sad, 8x16, dc, h, v, c, _sse2, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse2, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _ssse3, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse4, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _avx, _mmx2 )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _xop, _mmx2 )
+#endif
#endif
// No C implementation of intra_satd_x9. See checkasm for its behavior,
@@ -819,17 +832,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( sad, _mmx2 );
INIT7( sad_x3, _mmx2 );
INIT7( sad_x4, _mmx2 );
- INIT7( satd, _mmx2 );
+ INIT8( satd, _mmx2 );
INIT7( satd_x3, _mmx2 );
INIT7( satd_x4, _mmx2 );
INIT4( hadamard_ac, _mmx2 );
- INIT7( ssd, _mmx2 );
+ INIT8( ssd, _mmx2 );
INIT_ADS( _mmx2 );
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
+#if ARCH_X86
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
+#endif
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
@@ -855,6 +871,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
@@ -940,7 +957,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT8_NAME( sad_aligned, sad, _mmx2 );
INIT7( sad_x3, _mmx2 );
INIT7( sad_x4, _mmx2 );
- INIT7( satd, _mmx2 );
+ INIT8( satd, _mmx2 );
INIT7( satd_x3, _mmx2 );
INIT7( satd_x4, _mmx2 );
INIT4( hadamard_ac, _mmx2 );
@@ -955,6 +972,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
pixf->vsad = x264_pixel_vsad_mmx2;
if( cpu&X264_CPU_CACHELINE_32 )
@@ -983,6 +1001,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
#endif
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmx2;
+ pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_mmx2;
+ pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c_mmx2;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmx2;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmx2;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmx2;
@@ -1004,6 +1024,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
pixf->vsad = x264_pixel_vsad_sse2;
}
@@ -1013,6 +1034,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
INIT6( satd, _sse2 );
+ pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2;
INIT6( satd_x3, _sse2 );
INIT6( satd_x4, _sse2 );
if( !(cpu&X264_CPU_STACK_MOD4) )
@@ -1023,6 +1045,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
+ pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse2;
+ pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c_sse2;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( ssd, _sse2); /* faster for width 16 on p4 */
@@ -1071,6 +1095,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT4( hadamard_ac, _ssse3 );
pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_ssse3;
pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_ssse3;
+ pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_ssse3;
+#if ARCH_X86_64
+ pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_ssse3;
+#endif
}
INIT_ADS( _ssse3 );
if( !(cpu&X264_CPU_SLOW_ATOM) )
@@ -1078,21 +1106,17 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT8( ssd, _ssse3 );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
- INIT7( satd, _ssse3 );
+ INIT8( satd, _ssse3 );
INIT7( satd_x3, _ssse3 );
INIT7( satd_x4, _ssse3 );
}
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
+ pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
- pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_ssse3;
-#if ARCH_X86_64
- pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
-#endif
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
- if( cpu&X264_CPU_SHUFFLE_IS_FAST )
- pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
@@ -1107,7 +1131,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_SSE4 )
{
- INIT7( satd, _sse4 );
+ INIT8( satd, _sse4 );
INIT7( satd_x3, _sse4 );
INIT7( satd_x4, _sse4 );
if( !(cpu&X264_CPU_STACK_MOD4) )
@@ -1115,15 +1139,19 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT4( hadamard_ac, _sse4 );
pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_sse4;
pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_sse4;
+ pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_sse4;
+#if ARCH_X86_64
+ pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_sse4;
+#endif
}
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
- pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse4;
+ pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse4;
}
if( cpu&X264_CPU_AVX )
{
- INIT7( satd, _avx );
+ INIT8( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
INIT_ADS( _avx );
@@ -1132,21 +1160,42 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT4( hadamard_ac, _avx );
pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_avx;
pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_avx;
+ pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_avx;
+#if ARCH_X86_64
+ pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_avx;
+#endif
}
INIT5( ssd, _avx );
-#if ARCH_X86_64
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
- pixf->intra_sa8d_x3_8x8= x264_intra_sa8d_x3_8x8_avx;
-#endif
+ pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_avx;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx;
pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
- pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_avx;
- pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx;
+ }
+
+ if( cpu&X264_CPU_XOP )
+ {
+ INIT7( satd, _xop );
+ INIT7( satd_x3, _xop );
+ INIT7( satd_x4, _xop );
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ {
+ INIT4( hadamard_ac, _xop );
+ pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_xop;
+ }
+ INIT5( ssd, _xop );
+ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
+ pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
+ pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
}
#endif //HAVE_MMX
diff --git a/common/pixel.h b/common/pixel.h
index 41f6d05..cb09fe5 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -132,11 +132,14 @@ typedef struct
void (*intra_mbcmp_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] );
void (*intra_sa8d_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] );
void (*intra_sad_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] );
- /* find minimum satd or sad of all modes.
+ /* find minimum satd or sad of all modes, and set fdec.
* may be NULL, in which case just use pred+satd instead. */
int (*intra_mbcmp_x9_4x4)( pixel *fenc, pixel *fdec, uint16_t *bitcosts );
int (*intra_satd_x9_4x4) ( pixel *fenc, pixel *fdec, uint16_t *bitcosts );
int (*intra_sad_x9_4x4) ( pixel *fenc, pixel *fdec, uint16_t *bitcosts );
+ int (*intra_mbcmp_x9_8x8)( pixel *fenc, pixel *fdec, pixel edge[36], uint16_t *bitcosts, uint16_t *satds );
+ int (*intra_sa8d_x9_8x8) ( pixel *fenc, pixel *fdec, pixel edge[36], uint16_t *bitcosts, uint16_t *satds );
+ int (*intra_sad_x9_8x8) ( pixel *fenc, pixel *fdec, pixel edge[36], uint16_t *bitcosts, uint16_t *satds );
} x264_pixel_function_t;
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
diff --git a/common/predict.c b/common/predict.c
index ca20eac..646b587 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -634,6 +634,7 @@ static void x264_predict_8x8_filter_c( pixel *src, pixel edge[36], int i_neighbo
edge[14] = ((have_lt ? SRC(-1,-1) : SRC(-1,0))
+ 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2;
PL(1) PL(2) PL(3) PL(4) PL(5) PL(6)
+ edge[6] =
edge[7] = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2;
}
diff --git a/common/quant.c b/common/quant.c
index 9ed011a..9b6b6d8 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -435,13 +435,15 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
}
pf->decimate_score64 = x264_decimate_score64_mmx2;
- pf->coeff_last4 = x264_coeff_last4_mmx2;
+ pf->coeff_last8 = x264_coeff_last8_mmx2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
+ pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
+ pf->coeff_last4 = x264_coeff_last4_mmx2;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
if( cpu&X264_CPU_LZCNT )
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
@@ -464,17 +466,21 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
}
+ pf->coeff_last8 = x264_coeff_last8_sse2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
+ pf->coeff_level_run8 = x264_coeff_level_run8_sse2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
+ pf->coeff_last8 = x264_coeff_last8_sse2_lzcnt;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
+ pf->coeff_level_run8 = x264_coeff_level_run8_sse2_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
}
@@ -506,6 +512,15 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->denoise_dct = x264_denoise_dct_avx;
}
+ if( cpu&X264_CPU_XOP )
+ {
+ pf->dequant_4x4_dc = x264_dequant_4x4dc_xop;
+ if( h->param.i_cqm_preset != X264_CQM_FLAT )
+ {
+ pf->dequant_4x4 = x264_dequant_4x4_xop;
+ pf->dequant_8x8 = x264_dequant_8x8_xop;
+ }
+ }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
@@ -546,11 +561,15 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
pf->coeff_last4 = x264_coeff_last4_mmx2;
+ pf->coeff_last8 = x264_coeff_last8_mmx2;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
+ pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
+ pf->coeff_last8 = x264_coeff_last8_mmx2_lzcnt;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
+ pf->coeff_level_run8 = x264_coeff_level_run8_mmx2_lzcnt;
}
}
@@ -629,6 +648,15 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
pf->denoise_dct = x264_denoise_dct_avx;
}
+
+ if( cpu&X264_CPU_XOP )
+ {
+ if( h->param.i_cqm_preset != X264_CQM_FLAT )
+ {
+ pf->dequant_4x4 = x264_dequant_4x4_xop;
+ pf->dequant_8x8 = x264_dequant_8x8_xop;
+ }
+ }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
diff --git a/common/rectangle.c b/common/rectangle.c
index 1bb4f52..a59f0de 100644
--- a/common/rectangle.c
+++ b/common/rectangle.c
@@ -26,7 +26,7 @@
#include "common.h"
#define CACHE_FUNC(name,size,width,height)\
-void x264_macroblock_cache_##name##_##width##_##height( void *target, uint32_t val )\
+static void x264_macroblock_cache_##name##_##width##_##height( void *target, uint32_t val )\
{\
x264_macroblock_cache_rect( target, width*size, height, size, val );\
}
diff --git a/common/set.c b/common/set.c
index b958c7f..ce73d70 100644
--- a/common/set.c
+++ b/common/set.c
@@ -85,7 +85,7 @@ int x264_cqm_init( x264_t *h )
int max_qp_err = -1;
int max_chroma_qp_err = -1;
int min_qp_err = QP_MAX+1;
- int num_8x8_lists = CHROMA444 ? 4 : 2;
+ int num_8x8_lists = h->sps->i_chroma_format_idc == CHROMA_444 ? 4 : 2; /* Checkasm may segfault if optimized out by --chroma-format */
for( int i = 0; i < 4 + num_8x8_lists; i++ )
{
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index cdeb482..824def1 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -30,20 +30,14 @@
%include "x86inc.asm"
%include "x86util.asm"
-%macro SHUFFLE_16BIT 8
- %rep 8
- db %1*2
- db %1*2+1
- %rotate 1
- %endrep
-%endmacro
-
SECTION_RODATA
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
-pb_scan4framea: SHUFFLE_16BIT 6,3,7,0,4,1,2,5
-pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
+pb_scan4framea: SHUFFLE_MASK_W 6,3,7,0,4,1,2,5
+pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7
+pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9
+pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
@@ -1098,6 +1092,16 @@ INIT_XMM ssse3
SCAN_4x4_FRAME
INIT_XMM avx
SCAN_4x4_FRAME
+
+INIT_XMM xop
+cglobal zigzag_scan_4x4_frame, 2,2
+ mova m0, [r1+ 0]
+ mova m1, [r1+16]
+ vpperm m2, m0, m1, [pb_scan4frame2a]
+ vpperm m1, m0, m1, [pb_scan4frame2b]
+ mova [r0+ 0], m2
+ mova [r0+16], m1
+ RET
%endif ; !HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 619254d..6101504 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -88,6 +88,7 @@ void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index 9b197a1..cc25f64 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -1881,6 +1881,48 @@ INIT_MMX mmx2
DEBLOCK_CHROMA
%endif
+%macro DEBLOCK_H_CHROMA_422 0
+cglobal deblock_h_chroma_422, 5,7,8
+%ifdef ARCH_X86_64
+ %define cntr r11
+%else
+ %define cntr dword r0m
+%endif
+ dec r2d
+ dec r3d
+ sub r0, 4
+ lea t6, [r1*3]
+ mov t5, r0
+ add r0, t6
+ mov cntr, 32/mmsize
+.skip_prologue:
+ TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
+ LOAD_MASK r2d, r3d
+ movd m6, [r4] ; tc0
+ punpcklbw m6, m6
+%if mmsize == 16
+ punpcklbw m6, m6
+ punpcklbw m6, m6
+%else
+ pshufw m6, m6, q0000
+%endif
+ pand m7, m6
+ DEBLOCK_P0_Q0
+ TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
+ lea r0, [r0+r1*(mmsize/2)]
+ lea t5, [t5+r1*(mmsize/2)]
+ add r4, mmsize/8
+ dec cntr
+ jg .skip_prologue
+ REP_RET
+%endmacro
+
+INIT_MMX mmx2
+DEBLOCK_H_CHROMA_422
+INIT_XMM sse2
+DEBLOCK_H_CHROMA_422
+INIT_XMM avx
+DEBLOCK_H_CHROMA_422
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index a1b1cb0..2dd587e 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -1302,9 +1302,10 @@ MC_COPY 16
; void prefetch_fenc( pixel *pix_y, int stride_y,
; pixel *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
-INIT_MMX
+
+%macro PREFETCH_FENC 1
%ifdef ARCH_X86_64
-cglobal prefetch_fenc_mmx2, 5,5
+cglobal prefetch_fenc_%1, 5,5
FIX_STRIDES r1d, r3d
and r4d, 3
mov eax, r4d
@@ -1320,10 +1321,15 @@ cglobal prefetch_fenc_mmx2, 5,5
lea r2, [r2+rax*2+64*SIZEOF_PIXEL]
prefetcht0 [r2]
prefetcht0 [r2+r3]
+%ifidn %1, 422
+ lea r2, [r2+r3*2]
+ prefetcht0 [r2]
+ prefetcht0 [r2+r3]
+%endif
RET
%else
-cglobal prefetch_fenc_mmx2, 0,3
+cglobal prefetch_fenc_%1, 0,3
mov r2, r4m
mov r1, r1m
mov r0, r0m
@@ -1346,13 +1352,24 @@ cglobal prefetch_fenc_mmx2, 0,3
lea r0, [r0+r2*2+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
+%ifidn %1, 422
+ lea r0, [r0+r1*2]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+%endif
ret
%endif ; ARCH_X86_64
+%endmacro
+
+INIT_MMX mmx2
+PREFETCH_FENC 420
+PREFETCH_FENC 422
;-----------------------------------------------------------------------------
; void prefetch_ref( pixel *pix, int stride, int parity )
;-----------------------------------------------------------------------------
-cglobal prefetch_ref_mmx2, 3,3
+INIT_MMX mmx2
+cglobal prefetch_ref, 3,3
FIX_STRIDES r1d
dec r2d
and r2d, r1d
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index a0b570e..a11e2c5 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1635,8 +1635,8 @@ FRAME_INIT_LOWRES
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
-INIT_XMM
-cglobal mbtree_propagate_cost_sse2, 7,7,7
+%macro MBTREE 0
+cglobal mbtree_propagate_cost, 7,7,7
add r6d, r6d
lea r0, [r0+r6*2]
add r1, r6
@@ -1660,6 +1660,20 @@ cglobal mbtree_propagate_cost_sse2, 7,7,7
pand xmm3, xmm5
punpcklwd xmm1, xmm4
punpcklwd xmm3, xmm4
+%if cpuflag(fma4)
+ cvtdq2ps xmm0, xmm0
+ cvtdq2ps xmm1, xmm1
+ vfmaddps xmm0, xmm0, xmm6, xmm1
+ cvtdq2ps xmm1, xmm2
+ psubd xmm2, xmm3
+ cvtdq2ps xmm2, xmm2
+ rcpps xmm3, xmm1
+ mulps xmm1, xmm3
+ mulps xmm0, xmm2
+ addps xmm2, xmm3, xmm3
+ vfnmaddps xmm3, xmm1, xmm3, xmm2
+ mulps xmm0, xmm3
+%else
cvtdq2ps xmm0, xmm0
mulps xmm0, xmm6 ; intra*invq*fps_factor>>8
cvtdq2ps xmm1, xmm1 ; prop
@@ -1674,11 +1688,19 @@ cglobal mbtree_propagate_cost_sse2, 7,7,7
addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
subps xmm3, xmm1 ; 2nd approximation for 1/intra
mulps xmm0, xmm3 ; / intra
+%endif
cvtps2dq xmm0, xmm0
movdqa [r0+r6*2], xmm0
add r6, 8
jl .loop
REP_RET
+%endmacro
+
+INIT_XMM sse2
+MBTREE
+; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
+INIT_XMM fma4
+MBTREE
%macro INT16_TO_FLOAT 1
vpunpckhwd xmm4, xmm%1, xmm7
@@ -1688,7 +1710,8 @@ cglobal mbtree_propagate_cost_sse2, 7,7,7
%endmacro
; FIXME: align loads/stores to 16 bytes
-cglobal mbtree_propagate_cost_avx, 7,7,8
+INIT_YMM avx
+cglobal mbtree_propagate_cost, 7,7,8
add r6d, r6d
lea r0, [r0+r6*2]
add r1, r6
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index eaa588e..1700f90 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -86,7 +86,8 @@ void x264_mc_copy_w8_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_mmx( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int );
-void x264_prefetch_fenc_mmx2( pixel *, int, pixel *, int, int );
+void x264_prefetch_fenc_420_mmx2( pixel *, int, pixel *, int, int );
+void x264_prefetch_fenc_422_mmx2( pixel *, int, pixel *, int, int );
void x264_prefetch_ref_mmx2( pixel *, int, int );
void x264_plane_copy_core_mmx2( pixel *, int, pixel *, int, int w, int h);
void x264_plane_copy_c( pixel *, int, pixel *, int, int w, int h );
@@ -141,6 +142,8 @@ void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
@@ -515,7 +518,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( !(cpu&X264_CPU_MMX2) )
return;
- pf->prefetch_fenc = x264_prefetch_fenc_mmx2;
+ pf->prefetch_fenc_420 = x264_prefetch_fenc_420_mmx2;
+ pf->prefetch_fenc_422 = x264_prefetch_fenc_422_mmx2;
pf->prefetch_ref = x264_prefetch_ref_mmx2;
pf->plane_copy = x264_plane_copy_mmx2;
@@ -741,4 +745,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( !(cpu&X264_CPU_AVX) )
return;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
+
+ if( !(cpu&X264_CPU_FMA4) )
+ return;
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
}
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 759162a..9dc02ae 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -67,6 +67,7 @@ intrax9a_vrl2: db 2,10,11,12, 1, 3, 4, 5,12,13,14,15, 6, 7, 8, 9
intrax9a_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 4, 4, 4, 3, 3, 3, 3
intrax9a_vh2: db 6, 7, 8, 9, 6, 7, 8, 9, 2, 2, 2, 2, 1, 1, 1, 1
intrax9a_dc: db 1, 2, 3, 4, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1
+intrax9a_lut: db 0x60,0x68,0x80,0x00,0x08,0x20,0x40,0x28,0x48,0,0,0,0,0,0,0
pw_s01234567: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8005,0x8006,0x8007
pw_s01234657: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8006,0x8005,0x8007
intrax9_edge: db 0, 0, 1, 2, 3, 7, 8, 9,10,11,12,13,14,15,15,15
@@ -77,9 +78,46 @@ intrax9b_hdu1: db 15, 4, 5, 6,14, 2,13, 1,14, 3,15, 4,13, 1,12, 0
intrax9b_hdu2: db 13, 2,14, 3,12, 0,11,11,12, 1,13, 2,11,11,11,11
intrax9b_vrl1: db 10,11,12,13,11,12,13,14, 3, 4, 5, 6, 5, 6, 7, 8
intrax9b_vrl2: db 2,10,11,12,12,13,14,15, 1, 3, 4, 5, 6, 7, 8, 9
-intrax9b_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1
+intrax9b_vh1: db 6, 7, 8, 9, 4, 4, 4, 4, 6, 7, 8, 9, 3, 3, 3, 3
+intrax9b_vh2: db 6, 7, 8, 9, 2, 2, 2, 2, 6, 7, 8, 9, 1, 1, 1, 1
+intrax9b_edge2: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1
intrax9b_v1: db 0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1
intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1
+intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0
+
+intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5
+intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4
+intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1
+intra8x9_h4: db 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0
+intra8x9_ddl1: db 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9,10
+intra8x9_ddl2: db 2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9,10,11
+intra8x9_ddl3: db 5, 6, 7, 8, 9,10,11,12, 7, 8, 9,10,11,12,13,14
+intra8x9_ddl4: db 6, 7, 8, 9,10,11,12,13, 8, 9,10,11,12,13,14,15
+intra8x9_vl1: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
+intra8x9_vl2: db 1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9
+intra8x9_vl3: db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9,10
+intra8x9_vl4: db 3, 4, 5, 6, 7, 8, 9,10, 4, 5, 6, 7, 8, 9,10,11
+intra8x9_ddr1: db 8, 9,10,11,12,13,14,15, 6, 7, 8, 9,10,11,12,13
+intra8x9_ddr2: db 7, 8, 9,10,11,12,13,14, 5, 6, 7, 8, 9,10,11,12
+intra8x9_ddr3: db 4, 5, 6, 7, 8, 9,10,11, 2, 3, 4, 5, 6, 7, 8, 9
+intra8x9_ddr4: db 3, 4, 5, 6, 7, 8, 9,10, 1, 2, 3, 4, 5, 6, 7, 8
+intra8x9_vr1: db 8, 9,10,11,12,13,14,15, 7, 8, 9,10,11,12,13,14
+intra8x9_vr2: db 8, 9,10,11,12,13,14,15, 6, 8, 9,10,11,12,13,14
+intra8x9_vr3: db 5, 7, 8, 9,10,11,12,13, 3, 5, 7, 8, 9,10,11,12
+intra8x9_vr4: db 4, 6, 8, 9,10,11,12,13, 2, 4, 6, 8, 9,10,11,12
+intra8x9_hd1: db 3, 8, 9,10,11,12,13,14, 1, 6, 2, 7, 3, 8, 9,10
+intra8x9_hd2: db 2, 7, 3, 8, 9,10,11,12, 0, 5, 1, 6, 2, 7, 3, 8
+intra8x9_hd3: db 7, 8, 9,10,11,12,13,14, 3, 4, 5, 6, 7, 8, 9,10
+intra8x9_hd4: db 5, 6, 7, 8, 9,10,11,12, 1, 2, 3, 4, 5, 6, 7, 8
+intra8x9_hu1: db 13,12,11,10, 9, 8, 7, 6, 9, 8, 7, 6, 5, 4, 3, 2
+intra8x9_hu2: db 11,10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0
+intra8x9_hu3: db 5, 4, 3, 2, 1, 0,15,15, 1, 0,15,15,15,15,15,15
+intra8x9_hu4: db 3, 2, 1, 0,15,15,15,15,15,15,15,15,15,15,15,15
+pw_s00112233: dw 0x8000,0x8000,0x8001,0x8001,0x8002,0x8002,0x8003,0x8003
+pw_s00001111: dw 0x8000,0x8000,0x8000,0x8000,0x8001,0x8001,0x8001,0x8001
+
+transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
+transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
sw_f0: dq 0xfff0, 0
sq_0f: dq 0xffffffff, 0
@@ -175,6 +213,7 @@ cglobal pixel_ssd_%1x%2, 4,5
INIT_MMX mmx2
SSD_ONE 4, 4
SSD_ONE 4, 8
+SSD_ONE 4, 16
SSD_ONE 8, 4
SSD_ONE 8, 8
SSD_ONE 8, 16
@@ -417,6 +456,12 @@ INIT_MMX ssse3
SSD 4, 4
SSD 4, 8
SSD 4, 16
+INIT_XMM xop
+SSD 16, 16
+SSD 8, 8
+SSD 16, 8
+SSD 8, 16
+SSD 8, 4
%assign function_align 16
%endif ; !HIGH_BIT_DEPTH
@@ -654,20 +699,20 @@ SSD_NV12
;-----------------------------------------------------------------------------
; int pixel_var_wxh( uint8_t *, int )
;-----------------------------------------------------------------------------
-INIT_MMX
-cglobal pixel_var_16x16_mmx2, 2,3
+INIT_MMX mmx2
+cglobal pixel_var_16x16, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW 8*SIZEOF_PIXEL, 16
VAR_END 16, 16
-cglobal pixel_var_8x16_mmx2, 2,3
+cglobal pixel_var_8x16, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 8
VAR_END 8, 16
-cglobal pixel_var_8x8_mmx2, 2,3
+cglobal pixel_var_8x8, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 4
@@ -702,6 +747,8 @@ INIT_XMM sse2
VAR
INIT_XMM avx
VAR
+INIT_XMM xop
+VAR
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
@@ -756,14 +803,16 @@ INIT_XMM sse2
VAR
INIT_XMM avx
VAR
+INIT_XMM xop
+VAR
%endif ; !HIGH_BIT_DEPTH
-%macro VAR2_END 0
+%macro VAR2_END 1
HADDW m5, m7
movd r1d, m5
imul r1d, r1d
HADDD m6, m1
- shr r1d, 6
+ shr r1d, %1
movd eax, m6
mov [r4], eax
sub eax, r1d ; sqr - (sum * sum >> shift)
@@ -773,11 +822,11 @@ VAR
;-----------------------------------------------------------------------------
; int pixel_var2_8x8( pixel *, int, pixel *, int, int * )
;-----------------------------------------------------------------------------
-INIT_MMX
-cglobal pixel_var2_8x8_mmx2, 5,6
+%macro VAR2_8x8_MMX 2
+cglobal pixel_var2_8x%1, 5,6
FIX_STRIDES r1, r3
VAR_START 0
- mov r5d, 8
+ mov r5d, %1
.loop:
%ifdef HIGH_BIT_DEPTH
mova m0, [r0]
@@ -806,13 +855,19 @@ cglobal pixel_var2_8x8_mmx2, 5,6
add r2, r3
dec r5d
jg .loop
- VAR2_END
- RET
+ VAR2_END %2
+%endmacro
-INIT_XMM
-cglobal pixel_var2_8x8_sse2, 5,6,8
+%ifndef ARCH_X86_64
+INIT_MMX mmx2
+VAR2_8x8_MMX 8, 6
+VAR2_8x8_MMX 16, 7
+%endif
+
+%macro VAR2_8x8_SSE2 2
+cglobal pixel_var2_8x%1, 5,6,8
VAR_START 1
- mov r5d, 4
+ mov r5d, %1/2
.loop:
%ifdef HIGH_BIT_DEPTH
mova m0, [r0]
@@ -838,15 +893,20 @@ cglobal pixel_var2_8x8_sse2, 5,6,8
lea r2, [r2+r3*2*SIZEOF_PIXEL]
dec r5d
jg .loop
- VAR2_END
- RET
+ VAR2_END %2
+%endmacro
+
+INIT_XMM sse2
+VAR2_8x8_SSE2 8, 6
+VAR2_8x8_SSE2 16, 7
%ifndef HIGH_BIT_DEPTH
-cglobal pixel_var2_8x8_ssse3, 5,6,8
+%macro VAR2_8x8_SSSE3 2
+cglobal pixel_var2_8x%1, 5,6,8
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
mova m7, [hsub_mul]
- mov r5d, 2
+ mov r5d, %1/4
.loop:
movq m0, [r0]
movq m2, [r2]
@@ -882,8 +942,16 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8
lea r2, [r2+r3*2]
dec r5d
jg .loop
- VAR2_END
- RET
+ VAR2_END %2
+%endmacro
+
+INIT_XMM ssse3
+VAR2_8x8_SSSE3 8, 6
+VAR2_8x8_SSSE3 16, 7
+INIT_XMM xop
+VAR2_8x8_SSSE3 8, 6
+VAR2_8x8_SSSE3 16, 7
+
%endif ; !HIGH_BIT_DEPTH
;=============================================================================
@@ -1159,6 +1227,17 @@ cglobal pixel_satd_8x4, 4,6
call pixel_satd_8x4_internal_mmx2
SATD_END_MMX
+cglobal pixel_satd_4x16, 4,6
+ SATD_START_MMX
+ SATD_4x4_MMX m0, 0, 1
+ SATD_4x4_MMX m1, 0, 1
+ paddw m0, m1
+ SATD_4x4_MMX m1, 0, 1
+ paddw m0, m1
+ SATD_4x4_MMX m1, 0, 0
+ paddw m0, m1
+ SATD_END_MMX
+
cglobal pixel_satd_4x8, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 1
@@ -1205,32 +1284,7 @@ cglobal pixel_satd_4x4, 4,6
%endif
%endmacro
-;-----------------------------------------------------------------------------
-; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-%macro SATDS_SSE2 0
-%if cpuflag(ssse3)
-cglobal pixel_satd_4x4, 4, 6, 6
- SATD_START_MMX
- mova m4, [hmul_4p]
- LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
- LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
- LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
- LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
- DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
- HADAMARD 0, sumsub, 0, 1, 2, 3
- HADAMARD 4, sumsub, 0, 1, 2, 3
- HADAMARD 1, amax, 0, 1, 2, 3
- HADDW m0, m1
- movd eax, m0
- RET
-%endif
-
-cglobal pixel_satd_4x8, 4, 6, 8
- SATD_START_MMX
-%if cpuflag(ssse3)
- mova m7, [hmul_4p]
-%endif
+%macro SATD_4x8_SSE 2
movd m4, [r2]
movd m5, [r2+r3]
movd m6, [r2+2*r3]
@@ -1247,7 +1301,12 @@ cglobal pixel_satd_4x8, 4, 6, 8
JDUP m5, m3
movd m3, [r0+2*r1]
JDUP m1, m3
+%if cpuflag(ssse3) && %1==1
+ mova m3, [hmul_4p]
+ DIFFOP 0, 4, 1, 5, 3
+%else
DIFFOP 0, 4, 1, 5, 7
+%endif
movd m5, [r2]
add r2, r5
movd m3, [r0]
@@ -1260,10 +1319,57 @@ cglobal pixel_satd_4x8, 4, 6, 8
JDUP m5, m4
movd m4, [r0+r1]
JDUP m3, m4
+%if cpuflag(ssse3) && %1==1
+ mova m4, [hmul_4p]
+ DIFFOP 2, 6, 3, 5, 4
+%else
DIFFOP 2, 6, 3, 5, 7
- SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6, swap
- HADDW m6, m1
- movd eax, m6
+%endif
+ SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 7, %2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+%macro SATDS_SSE2 0
+%if cpuflag(ssse3)
+cglobal pixel_satd_4x4, 4, 6, 6
+ SATD_START_MMX
+ mova m4, [hmul_4p]
+ LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
+ LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
+ LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
+ LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
+ DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
+ HADAMARD 0, sumsub, 0, 1, 2, 3
+ HADAMARD 4, sumsub, 0, 1, 2, 3
+ HADAMARD 1, amax, 0, 1, 2, 3
+ HADDW m0, m1
+ movd eax, m0
+ RET
+%endif
+
+cglobal pixel_satd_4x8, 4, 6, 8
+ SATD_START_MMX
+%if cpuflag(ssse3)
+ mova m7, [hmul_4p]
+%endif
+ SATD_4x8_SSE 0, swap
+ HADDW m7, m1
+ movd eax, m7
+ RET
+
+cglobal pixel_satd_4x16, 4, 6, 8
+ SATD_START_MMX
+%if cpuflag(ssse3)
+ mova m7, [hmul_4p]
+%endif
+ SATD_4x8_SSE 0, swap
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ SATD_4x8_SSE 1, add
+ HADDW m7, m1
+ movd eax, m7
RET
cglobal pixel_satd_8x8_internal
@@ -1374,18 +1480,7 @@ cglobal pixel_sa8d_8x8_internal
%if vertical
HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
%else ; non-sse2
- HADAMARD4_V 0, 1, 2, 8, 6
- HADAMARD4_V 4, 5, 3, 9, 6
- SUMSUB_BADC w, 0, 4, 1, 5, 6
- HADAMARD 2, sumsub, 0, 4, 6, 11
- HADAMARD 2, sumsub, 1, 5, 6, 11
- SUMSUB_BADC w, 2, 3, 8, 9, 6
- HADAMARD 2, sumsub, 2, 3, 6, 11
- HADAMARD 2, sumsub, 8, 9, 6, 11
- HADAMARD 1, amax, 0, 4, 6, 11
- HADAMARD 1, amax, 1, 5, 6, 4
- HADAMARD 1, amax, 2, 3, 6, 4
- HADAMARD 1, amax, 8, 9, 6, 4
+ HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11
%endif
paddw m0, m1
paddw m0, m2
@@ -1590,12 +1685,14 @@ cglobal pixel_sa8d_16x16, 4,7
paddw %3, %5
%endmacro
+; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+,
+; and are only retained for old cpus.
%macro INTRA_SA8D_SSE2 0
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
-cglobal intra_sa8d_x3_8x8, 3,3,16
+cglobal intra_sa8d_x3_8x8, 3,3,14
; 8x8 hadamard
pxor m8, m8
movq m0, [r0+0*FENC_STRIDE]
@@ -1622,23 +1719,15 @@ cglobal intra_sa8d_x3_8x8, 3,3,16
paddusw m8, m10
paddusw m9, m11
ABSW2 m10, m11, m6, m7, m6, m7
- ABSW m15, m1, m1
+ ABSW m13, m1, m1
paddusw m10, m11
paddusw m8, m9
- paddusw m15, m10
- paddusw m15, m8
+ paddusw m13, m10
+ paddusw m13, m8
; 1D hadamard of edges
movq m8, [r1+7]
movq m9, [r1+16]
-%if cpuflag(ssse3)
- punpcklwd m8, m8
- pshufb m9, [intrax3_shuf]
- pmaddubsw m8, [pb_pppm]
- pmaddubsw m9, [pb_pppm]
- HSUMSUB2 psignw, m8, m9, m10, m11, m9, q1032, [pw_ppppmmmm]
- HSUMSUB2 psignw, m8, m9, m10, m11, m9, q2301, [pw_ppmmppmm]
-%else ; sse2
pxor m10, m10
punpcklbw m8, m10
punpcklbw m9, m10
@@ -1652,7 +1741,6 @@ cglobal intra_sa8d_x3_8x8, 3,3,16
pmullw m11, [pw_pmpmpmpm]
paddw m8, m10
paddw m9, m11
-%endif
; differences
paddw m10, m8, m9
@@ -1664,8 +1752,8 @@ cglobal intra_sa8d_x3_8x8, 3,3,16
psubw m8, m0
psubw m10, m0
ABSW2 m8, m10, m8, m10, m11, m12 ; 1x8 sum
- paddusw m14, m8, m15
- paddusw m15, m10
+ paddusw m8, m13
+ paddusw m13, m10
punpcklwd m0, m1
punpcklwd m2, m3
punpcklwd m4, m5
@@ -1674,7 +1762,7 @@ cglobal intra_sa8d_x3_8x8, 3,3,16
punpckldq m4, m6
punpcklqdq m0, m4 ; transpose
psllw m9, 3 ; top edge
- psrldq m2, m15, 2 ; 8x7 sum
+ psrldq m2, m13, 2 ; 8x7 sum
psubw m0, m9 ; 8x1 sum
ABSW m0, m0, m9
paddusw m2, m0
@@ -1682,21 +1770,21 @@ cglobal intra_sa8d_x3_8x8, 3,3,16
; 3x HADDW
movdqa m7, [pw_1]
pmaddwd m2, m7
- pmaddwd m14, m7
- pmaddwd m15, m7
- punpckhdq m3, m2, m14
- punpckldq m2, m14
- pshufd m5, m15, q3311
+ pmaddwd m8, m7
+ pmaddwd m13, m7
+ punpckhdq m3, m2, m8
+ punpckldq m2, m8
+ pshufd m5, m13, q3311
paddd m2, m3
- paddd m5, m15
- punpckhqdq m3, m2, m5
+ paddd m5, m13
+ punpckhqdq m0, m2, m5
punpcklqdq m2, m5
- pavgw m3, m2
- pxor m0, m0
- pavgw m3, m0
- movq [r2], m3 ; i8x8_v, i8x8_h
- psrldq m3, 8
- movd [r2+8], m3 ; i8x8_dc
+ pavgw m0, m2
+ pxor m1, m1
+ pavgw m0, m1
+ movq [r2], m0 ; i8x8_v, i8x8_h
+ psrldq m0, 8
+ movd [r2+8], m0 ; i8x8_dc
RET
%endif ; ARCH_X86_64
%endmacro ; INTRA_SA8D_SSE2
@@ -2021,12 +2109,20 @@ cglobal intra_satd_x3_8x8c, 0,6
%macro PRED4x4_LOWPASS 5
+%ifid %5
+ pavgb %5, %2, %3
+ pxor %3, %2
+ pand %3, [pb_1]
+ psubusb %5, %3
+ pavgb %1, %4, %5
+%else
mova %5, %2
pavgb %2, %3
pxor %3, %5
pand %3, [pb_1]
psubusb %2, %3
pavgb %1, %4, %2
+%endif
%endmacro
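Both branches of PRED4x4_LOWPASS compute the usual 3-tap lowpass entirely in bytes; the pxor/pand/psubusb sequence cancels pavgb's round-up so the result matches the exact formula. A minimal scalar sketch of the identity it relies on (illustrative C, with the centre tap weighted 2 as in the macro):

    #include <stdint.h>

    /* (a + 2*b + c + 2) >> 2 built from two byte averages, as PRED4x4_LOWPASS does. */
    static uint8_t lowpass3( uint8_t a, uint8_t b, uint8_t c )
    {
        uint8_t avg_ac = (uint8_t)((a + c + 1) >> 1); /* pavgb a, c: rounds up        */
        avg_ac -= (a ^ c) & 1;                        /* pxor/pand/psubusb correction */
        return (uint8_t)((avg_ac + b + 1) >> 1);      /* pavgb with the centre tap    */
    }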
%macro INTRA_X9_PRED 2
@@ -2081,18 +2177,20 @@ cglobal intra_satd_x3_8x8c, 0,6
%endmacro ; INTRA_X9_PRED
%macro INTRA_X9_VHDC 5 ; edge, fenc01, fenc23, tmp, tmp
- pshufb m%1, [intrax9b_vh1] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3
+ pshufb m2, m%1, [intrax9b_vh1]
+ pshufb m3, m%1, [intrax9b_vh2]
+ mova [pred_buf+0x60], m2
+ mova [pred_buf+0x70], m3
+ pshufb m%1, [intrax9b_edge2] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3
pmaddubsw m%1, [hmul_4p]
pshufhw m0, m%1, q2301
pshuflw m0, m0, q2301
psignw m%1, [pw_pmpmpmpm]
paddw m0, m%1
psllw m0, 2 ; hadamard(top), hadamard(left)
- mova m1, m0
- mova m2, m0
movhlps m3, m0
- pshufb m1, [intrax9b_v1]
- pshufb m2, [intrax9b_v2]
+ pshufb m1, m0, [intrax9b_v1]
+ pshufb m2, m0, [intrax9b_v2]
paddw m0, m3
psignw m3, [pw_pmmpzzzz] ; FIXME could this be eliminated?
pavgw m0, [pw_16]
@@ -2102,6 +2200,13 @@ cglobal intra_satd_x3_8x8c, 0,6
; Which would be faster on conroe, but slower on penryn and sandybridge, and too invasive to ifdef.
HADAMARD 0, sumsub, %2, %3, %4, %5
HADAMARD 1, sumsub, %2, %3, %4, %5
+ movd r3d, m0
+ shr r3d, 4
+ imul r3d, 0x01010101
+ mov [pred_buf+0x80], r3d
+ mov [pred_buf+0x88], r3d
+ mov [pred_buf+0x90], r3d
+ mov [pred_buf+0x98], r3d
psubw m3, m%2
psubw m0, m%2
psubw m1, m%2
@@ -2122,17 +2227,23 @@ cglobal intra_satd_x3_8x8c, 0,6
%endif
movhlps m2, m1
paddw m1, m2
+%if cpuflag(xop)
+ vphaddwq m3, m3
+ vphaddwq m1, m1
+ packssdw m1, m3
+%else
phaddw m1, m3
pmaddwd m1, [pw_1] ; v, _, h, dc
+%endif
%endmacro ; INTRA_X9_VHDC
-%macro INTRA_X9_END 1
+%macro INTRA_X9_END 2
%if cpuflag(sse4)
phminposuw m0, m0 ; h,dc,ddl,ddr,vr,hd,vl,hu
movd eax, m0
add eax, 1<<16
- cmp ax, r1w
- cmovge eax, r1d
+ cmp ax, r3w
+ cmovge eax, r3d
%else
%if %1
; 4x4 sad is up to 12 bits; +bitcosts -> 13 bits; pack with 3 bit index
@@ -2158,22 +2269,58 @@ cglobal intra_satd_x3_8x8c, 0,6
; 1<<16: increment index to match intra4x4_pred_e. couldn't do this before because it had to fit in 3 bits
; 1<<12: undo sign manipulation
lea eax, [rax+r2+(1<<16)+(1<<12)]
- cmp ax, r1w
- cmovge eax, r1d
+ cmp ax, r3w
+ cmovge eax, r3d
%endif ; cpuflag
+
+ ; output the predicted samples
+ mov r3d, eax
+ shr r3d, 16
+%ifdef PIC
+ lea r2, [%2_lut]
+ movzx r2d, byte [r2+r3]
+%else
+ movzx r2d, byte [%2_lut+r3]
+%endif
+%if %1 ; sad
+ movq mm0, [pred_buf+r2]
+ movq mm1, [pred_buf+r2+16]
+ movd [r1+0*FDEC_STRIDE], mm0
+ movd [r1+2*FDEC_STRIDE], mm1
+ psrlq mm0, 32
+ psrlq mm1, 32
+ movd [r1+1*FDEC_STRIDE], mm0
+ movd [r1+3*FDEC_STRIDE], mm1
+%else ; satd
+%assign i 0
+%rep 4
+ mov r3d, [pred_buf+r2+8*i]
+ mov [r1+i*FDEC_STRIDE], r3d
+%assign i i+1
+%endrep
+%endif
%endmacro ; INTRA_X9_END
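The non-sse4 branch above works by packing each candidate's cost into the upper bits of a word and its mode index into the low bits, so that a single minimum selects both at once (the sign tricks and the 1<<16/1<<12 bookkeeping visible above are omitted here). A hedged scalar sketch of the idea, with illustrative names:

    /* Pick the cheapest of n modes; requires cost[i] << index_bits to fit in an int. */
    static int pick_min_packed( const int *cost, int n, int index_bits )
    {
        int best = cost[0] << index_bits;           /* mode 0: index bits are zero */
        for( int i = 1; i < n; i++ )
        {
            int packed = (cost[i] << index_bits) | i;
            if( packed < best )
                best = packed;
        }
        return best; /* best >> index_bits is the cost, the low bits are the mode */
    }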
%macro INTRA_X9 0
;-----------------------------------------------------------------------------
; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
;-----------------------------------------------------------------------------
-cglobal intra_sad_x9_4x4, 3,3,9
+%if notcpuflag(xop)
+cglobal intra_sad_x9_4x4, 3,4,9
+ %assign pad 0xc0-gprsize-(stack_offset&15)
+ %define pred_buf rsp
+ sub rsp, pad
%ifdef ARCH_X86_64
INTRA_X9_PRED intrax9a, m8
%else
- sub rsp, 0x1c
- INTRA_X9_PRED intrax9a, [rsp]
+ INTRA_X9_PRED intrax9a, [rsp+0xa0]
%endif
+ mova [rsp+0x00], m2
+ mova [rsp+0x10], m3
+ mova [rsp+0x20], m4
+ mova [rsp+0x30], m5
+ mova [rsp+0x40], m6
+ mova [rsp+0x50], m7
%if cpuflag(sse4)
movd m0, [r0+0*FENC_STRIDE]
pinsrd m0, [r0+1*FENC_STRIDE], 1
@@ -2203,27 +2350,29 @@ cglobal intra_sad_x9_4x4, 3,3,9
pxor m8, m8
%define %%zero m8
%else
- mova m7, [rsp]
+ mova m7, [rsp+0xa0]
%define %%zero [pb_0]
%endif
- mova m3, m7
- mova m5, m7
+ pshufb m3, m7, [intrax9a_vh1]
+ pshufb m5, m7, [intrax9a_vh2]
pshufb m7, [intrax9a_dc]
- pshufb m3, [intrax9a_vh1]
psadbw m7, %%zero
- pshufb m5, [intrax9a_vh2]
psrlw m7, 2
+ mova [rsp+0x60], m3
+ mova [rsp+0x70], m5
psadbw m3, m0
pavgw m7, %%zero
pshufb m7, %%zero
psadbw m5, m1
+ movq [rsp+0x80], m7
+ movq [rsp+0x90], m7
psadbw m0, m7
paddd m3, m5
psadbw m1, m7
paddd m0, m1
- movzx r1d, word [r2]
+ movzx r3d, word [r2]
movd r0d, m3 ; v
- add r1d, r0d
+ add r3d, r0d
punpckhqdq m3, m0 ; h, dc
shufps m3, m2, q2020
psllq m6, 32
@@ -2231,18 +2380,26 @@ cglobal intra_sad_x9_4x4, 3,3,9
movu m0, [r2+2]
packssdw m3, m4
paddw m0, m3
- INTRA_X9_END 1
-%ifndef ARCH_X86_64
- add rsp, 0x1c
-%endif
+ INTRA_X9_END 1, intrax9a
+ add rsp, pad
RET
+%endif ; cpuflag
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
;-----------------------------------------------------------------------------
-cglobal intra_satd_x9_4x4, 3,3,16
+cglobal intra_satd_x9_4x4, 3,4,16
+ %assign pad 0xb0-gprsize-(stack_offset&15)
+ %define pred_buf rsp
+ sub rsp, pad
INTRA_X9_PRED intrax9b, m15
+ mova [rsp+0x00], m2
+ mova [rsp+0x10], m3
+ mova [rsp+0x20], m4
+ mova [rsp+0x30], m5
+ mova [rsp+0x40], m6
+ mova [rsp+0x50], m7
movd m8, [r0+0*FENC_STRIDE]
movd m9, [r0+1*FENC_STRIDE]
movd m10, [r0+2*FENC_STRIDE]
@@ -2286,7 +2443,7 @@ cglobal intra_satd_x9_4x4, 3,3,16
INTRA_X9_VHDC 15, 8, 10, 6, 7
; find minimum
movu m0, [r2+2]
- movd r1d, m1
+ movd r3d, m1
palignr m5, m1, 8
%if notcpuflag(sse4)
pshufhw m0, m0, q3120 ; compensate for different order in unpack
@@ -2294,8 +2451,9 @@ cglobal intra_satd_x9_4x4, 3,3,16
packssdw m5, m4
paddw m0, m5
movzx r0d, word [r2]
- add r1d, r0d
- INTRA_X9_END 0
+ add r3d, r0d
+ INTRA_X9_END 0, intrax9b
+ add rsp, pad
RET
RESET_MM_PERMUTATION
ALIGN 16
@@ -2319,13 +2477,19 @@ ALIGN 16
ret
%else ; !ARCH_X86_64
-cglobal intra_satd_x9_4x4, 3,3,8
- sub rsp, 0x9c
- INTRA_X9_PRED intrax9b, [rsp+0x80]
- mova [rsp+0x40], m4
- mova [rsp+0x50], m5
- mova [rsp+0x60], m6
- mova [rsp+0x70], m7
+cglobal intra_satd_x9_4x4, 3,4,8
+ %assign pad 0x120-gprsize-(stack_offset&15)
+ %define fenc_buf rsp
+ %define pred_buf rsp+0x40
+ %define spill rsp+0xe0
+ sub rsp, pad
+ INTRA_X9_PRED intrax9b, [spill+0x20]
+ mova [pred_buf+0x00], m2
+ mova [pred_buf+0x10], m3
+ mova [pred_buf+0x20], m4
+ mova [pred_buf+0x30], m5
+ mova [pred_buf+0x40], m6
+ mova [pred_buf+0x50], m7
movd m4, [r0+0*FENC_STRIDE]
movd m5, [r0+1*FENC_STRIDE]
movd m6, [r0+2*FENC_STRIDE]
@@ -2339,10 +2503,10 @@ cglobal intra_satd_x9_4x4, 3,3,8
pmaddubsw m5, m7
pmaddubsw m6, m7
pmaddubsw m0, m7
- mova [rsp+0x00], m4
- mova [rsp+0x10], m5
- mova [rsp+0x20], m6
- mova [rsp+0x30], m0
+ mova [fenc_buf+0x00], m4
+ mova [fenc_buf+0x10], m5
+ mova [fenc_buf+0x20], m6
+ mova [fenc_buf+0x30], m0
movddup m0, m2
pshufd m1, m2, q3232
movddup m2, m3
@@ -2355,49 +2519,47 @@ cglobal intra_satd_x9_4x4, 3,3,8
psubw m1, m5
psubw m2, m6
call .satd_8x4b ; ddr, ddl
- mova m3, [rsp+0x50]
- mova m1, [rsp+0x40]
+ mova m3, [pred_buf+0x30]
+ mova m1, [pred_buf+0x20]
movddup m2, m3
movhlps m3, m3
- movq [rsp+0x48], m0
+ movq [spill+0x08], m0
movddup m0, m1
movhlps m1, m1
call .satd_8x4 ; vr, vl
- mova m3, [rsp+0x70]
- mova m1, [rsp+0x60]
+ mova m3, [pred_buf+0x50]
+ mova m1, [pred_buf+0x40]
movddup m2, m3
movhlps m3, m3
- movq [rsp+0x50], m0
+ movq [spill+0x10], m0
movddup m0, m1
movhlps m1, m1
call .satd_8x4 ; hd, hu
- movq [rsp+0x58], m0
- mova m1, [rsp+0x80]
- mova m4, [rsp+0x00]
- mova m5, [rsp+0x20]
+ movq [spill+0x18], m0
+ mova m1, [spill+0x20]
+ mova m4, [fenc_buf+0x00]
+ mova m5, [fenc_buf+0x20]
mova m2, [pw_ppmmppmm]
psignw m4, m2
psignw m5, m2
- paddw m4, [rsp+0x10]
- paddw m5, [rsp+0x30]
+ paddw m4, [fenc_buf+0x10]
+ paddw m5, [fenc_buf+0x30]
INTRA_X9_VHDC 1, 4, 5, 6, 7
; find minimum
movu m0, [r2+2]
- movd r1d, m1
- movhlps m1, m1
- movhps m1, [rsp+0x48]
+ movd r3d, m1
+ punpckhqdq m1, [spill+0x00]
+ packssdw m1, [spill+0x10]
%if cpuflag(sse4)
- pshufd m2, [rsp+0x50], q3120
- packssdw m1, m2
+ pshufhw m1, m1, q3120
%else
- packssdw m1, [rsp+0x50]
pshufhw m0, m0, q3120
%endif
paddw m0, m1
movzx r0d, word [r2]
- add r1d, r0d
- INTRA_X9_END 0
- add rsp, 0x9c
+ add r3d, r0d
+ INTRA_X9_END 0, intrax9b
+ add rsp, pad
RET
RESET_MM_PERMUTATION
ALIGN 16
@@ -2406,11 +2568,12 @@ ALIGN 16
pmaddubsw m1, m7
pmaddubsw m2, m7
pmaddubsw m3, m7
- psubw m0, [rsp+0x00+gprsize]
- psubw m1, [rsp+0x10+gprsize]
- psubw m2, [rsp+0x20+gprsize]
+ %xdefine fenc_buf fenc_buf+gprsize
+ psubw m0, [fenc_buf+0x00]
+ psubw m1, [fenc_buf+0x10]
+ psubw m2, [fenc_buf+0x20]
.satd_8x4b:
- psubw m3, [rsp+0x30+gprsize]
+ psubw m3, [fenc_buf+0x30]
SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 0, swap
pmaddwd m0, [pw_1]
%if cpuflag(sse4)
@@ -2425,6 +2588,668 @@ ALIGN 16
+%macro INTRA8_X9 0
+;-----------------------------------------------------------------------------
+; int intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
+;-----------------------------------------------------------------------------
+cglobal intra_sad_x9_8x8, 5,6,9
+ %define fenc02 m4
+ %define fenc13 m5
+ %define fenc46 m6
+ %define fenc57 m7
+%ifdef ARCH_X86_64
+ %define tmp m8
+ %assign padbase 0x0
+%else
+ %define tmp [rsp]
+ %assign padbase 0x10
+%endif
+ %assign pad 0x240+0x10+padbase-gprsize-(stack_offset&15)
+ %define pred(i,j) [rsp+i*0x40+j*0x10+padbase]
+
+ SUB rsp, pad
+ movq fenc02, [r0+FENC_STRIDE* 0]
+ movq fenc13, [r0+FENC_STRIDE* 1]
+ movq fenc46, [r0+FENC_STRIDE* 4]
+ movq fenc57, [r0+FENC_STRIDE* 5]
+ movhps fenc02, [r0+FENC_STRIDE* 2]
+ movhps fenc13, [r0+FENC_STRIDE* 3]
+ movhps fenc46, [r0+FENC_STRIDE* 6]
+ movhps fenc57, [r0+FENC_STRIDE* 7]
+
+ ; save instruction size: avoid 4-byte memory offsets
+ lea r0, [intra8x9_h1+128]
+ %define off(m) (r0+m-(intra8x9_h1+128))
+
+; v
+ movddup m0, [r2+16]
+ mova pred(0,0), m0
+ psadbw m1, m0, fenc02
+ mova pred(0,1), m0
+ psadbw m2, m0, fenc13
+ mova pred(0,2), m0
+ psadbw m3, m0, fenc46
+ mova pred(0,3), m0
+ psadbw m0, m0, fenc57
+ paddw m1, m2
+ paddw m0, m3
+ paddw m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+ movd [r4+0], m0
+
+; h
+ movq m0, [r2+7]
+ pshufb m1, m0, [off(intra8x9_h1)]
+ pshufb m2, m0, [off(intra8x9_h2)]
+ mova pred(1,0), m1
+ psadbw m1, fenc02
+ mova pred(1,1), m2
+ psadbw m2, fenc13
+ paddw m1, m2
+ pshufb m3, m0, [off(intra8x9_h3)]
+ pshufb m2, m0, [off(intra8x9_h4)]
+ mova pred(1,2), m3
+ psadbw m3, fenc46
+ mova pred(1,3), m2
+ psadbw m2, fenc57
+ paddw m1, m3
+ paddw m1, m2
+ movhlps m2, m1
+ paddw m1, m2
+ movd [r4+2], m1
+
+ lea r5, [rsp+padbase+0x100]
+ %define pred(i,j) [r5+i*0x40+j*0x10-0x100]
+
+; dc
+ movhps m0, [r2+16]
+ pxor m2, m2
+ psadbw m0, m2
+ movhlps m1, m0
+ paddw m0, m1
+ psrlw m0, 3
+ pavgw m0, m2
+ pshufb m0, m2
+ mova pred(2,0), m0
+ psadbw m1, m0, fenc02
+ mova pred(2,1), m0
+ psadbw m2, m0, fenc13
+ mova pred(2,2), m0
+ psadbw m3, m0, fenc46
+ mova pred(2,3), m0
+ psadbw m0, m0, fenc57
+ paddw m1, m2
+ paddw m0, m3
+ paddw m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+ movd [r4+4], m0
+
+; ddl
+; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
+; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
+; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
+; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
+; Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC
+; Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD
+; Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE
+; Ft8 Ft9 FtA FtB FtC FtD FtE FtF
+ mova m0, [r2+16]
+ movu m2, [r2+17]
+ pslldq m1, m0, 1
+ pavgb m3, m0, m2 ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB ___ ___ ___ ___ ___
+ PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; ___ Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE FtF
+ pshufb m1, m0, [off(intra8x9_ddl1)]
+ pshufb m2, m0, [off(intra8x9_ddl2)]
+ mova pred(3,0), m1
+ psadbw m1, fenc02
+ mova pred(3,1), m2
+ psadbw m2, fenc13
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_ddl3)]
+ mova pred(3,2), m2
+ psadbw m2, fenc46
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_ddl4)]
+ mova pred(3,3), m2
+ psadbw m2, fenc57
+ paddw m1, m2
+ movhlps m2, m1
+ paddw m1, m2
+ movd [r4+6], m1
+
+; vl
+; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8
+; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
+; Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9
+; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
+; Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA
+; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
+; Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB
+; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
+ pshufb m1, m3, [off(intra8x9_vl1)]
+ pshufb m2, m0, [off(intra8x9_vl2)]
+ pshufb m3, m3, [off(intra8x9_vl3)]
+ pshufb m0, m0, [off(intra8x9_vl4)]
+ mova pred(7,0), m1
+ psadbw m1, fenc02
+ mova pred(7,1), m2
+ psadbw m2, fenc13
+ mova pred(7,2), m3
+ psadbw m3, fenc46
+ mova pred(7,3), m0
+ psadbw m0, fenc57
+ paddw m1, m2
+ paddw m0, m3
+ paddw m0, m1
+ movhlps m1, m0
+ paddw m0, m1
+%if cpuflag(sse4)
+ pextrw [r4+14], m0, 0
+%else
+ movd r5d, m0
+ mov [r4+14], r5w
+ lea r5, [rsp+padbase+0x100]
+%endif
+
+; ddr
+; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
+; Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
+; Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4
+; Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3
+; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2
+; Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1
+; Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0
+; Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt
+ movu m2, [r2+8]
+ movu m0, [r2+7]
+ movu m1, [r2+6]
+ pavgb m3, m2, m0 ; Gl6 Gl5 Gl4 Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
+ PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
+ pshufb m1, m0, [off(intra8x9_ddr1)]
+ pshufb m2, m0, [off(intra8x9_ddr2)]
+ mova pred(4,0), m1
+ psadbw m1, fenc02
+ mova pred(4,1), m2
+ psadbw m2, fenc13
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_ddr3)]
+ mova pred(4,2), m2
+ psadbw m2, fenc46
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_ddr4)]
+ mova pred(4,3), m2
+ psadbw m2, fenc57
+ paddw m1, m2
+ movhlps m2, m1
+ paddw m1, m2
+ movd [r4+8], m1
+
+ add r0, 256
+ add r5, 0xC0
+ %define off(m) (r0+m-(intra8x9_h1+256+128))
+ %define pred(i,j) [r5+i*0x40+j*0x10-0x1C0]
+
+; vr
+; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
+; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
+; Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6
+; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
+; Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
+; Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4
+; Fl4 Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4
+; Fl5 Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3
+ movsd m2, m3, m0 ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
+ pshufb m1, m2, [off(intra8x9_vr1)]
+ pshufb m2, m2, [off(intra8x9_vr3)]
+ mova pred(5,0), m1
+ psadbw m1, fenc02
+ mova pred(5,2), m2
+ psadbw m2, fenc46
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_vr2)]
+ mova pred(5,1), m2
+ psadbw m2, fenc13
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_vr4)]
+ mova pred(5,3), m2
+ psadbw m2, fenc57
+ paddw m1, m2
+ movhlps m2, m1
+ paddw m1, m2
+ movd [r4+10], m1
+
+; hd
+; Glt Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
+; Gl0 Fl0 Glt Flt Ft0 Ft1 Ft2 Ft3
+; Gl1 Fl1 Gl0 Fl0 Glt Flt Ft0 Ft1
+; Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 Glt Flt
+; Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0
+; Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1
+; Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2
+; Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3
+ pshufd m2, m3, q0001
+%if cpuflag(sse4)
+ pblendw m2, m0, q3330 ; Gl2 Gl1 Gl0 Glt ___ Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ___
+%else
+ movss m1, m0, m2
+ SWAP 1, 2
+%endif
+ punpcklbw m0, m3 ; Fl7 Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 ___
+ pshufb m1, m2, [off(intra8x9_hd1)]
+ pshufb m2, m2, [off(intra8x9_hd2)]
+ mova pred(6,0), m1
+ psadbw m1, fenc02
+ mova pred(6,1), m2
+ psadbw m2, fenc13
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_hd3)]
+ pshufb m3, m0, [off(intra8x9_hd4)]
+ mova pred(6,2), m2
+ psadbw m2, fenc46
+ mova pred(6,3), m3
+ psadbw m3, fenc57
+ paddw m1, m2
+ paddw m1, m3
+ movhlps m2, m1
+ paddw m1, m2
+    ; don't store straight to [r4+12]: it's too close to the upcoming load of dqword [r4] and would cause a store-forwarding stall
+ pslldq m1, 12
+ SWAP 3, 1
+
+; hu
+; Gl0 Fl1 Gl1 Fl2 Gl2 Fl3 Gl3 Fl4
+; Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 Gl4 Fl5
+; Gl2 Fl3 Gl3 Fl4 Gl4 Fl5 Gl5 Fl6
+; Gl3 Fl4 Gl4 Fl5 Gl5 Fl6 Gl6 Fl7
+; Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 Gl7 Gl7
+; Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 Gl7 Gl7
+; Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
+; Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
+%if cpuflag(sse4)
+ pinsrb m0, [r2+7], 15 ; Gl7
+%else
+ movd m1, [r2+7]
+ pslldq m0, 1
+ palignr m1, m0, 1
+ SWAP 0, 1
+%endif
+ pshufb m1, m0, [off(intra8x9_hu1)]
+ pshufb m2, m0, [off(intra8x9_hu2)]
+ mova pred(8,0), m1
+ psadbw m1, fenc02
+ mova pred(8,1), m2
+ psadbw m2, fenc13
+ paddw m1, m2
+ pshufb m2, m0, [off(intra8x9_hu3)]
+ pshufb m0, m0, [off(intra8x9_hu4)]
+ mova pred(8,2), m2
+ psadbw m2, fenc46
+ mova pred(8,3), m0
+ psadbw m0, fenc57
+ paddw m1, m2
+ paddw m1, m0
+ movhlps m2, m1
+ paddw m1, m2
+ movd r2d, m1
+
+ movu m0, [r3]
+ por m3, [r4]
+ paddw m0, m3
+ mova [r4], m0
+ movzx r5d, word [r3+16]
+ add r2d, r5d
+ mov [r4+16], r2w
+
+%if cpuflag(sse4)
+ phminposuw m0, m0 ; v,h,dc,ddl,ddr,vr,hd,vl
+ movd eax, m0
+%else
+ ; 8x8 sad is up to 14 bits; +bitcosts and saturate -> 14 bits; pack with 2 bit index
+ paddusw m0, m0
+ paddusw m0, m0
+ paddw m0, [off(pw_s00112233)]
+ movhlps m1, m0
+ pminsw m0, m1
+ pshuflw m1, m0, q0032
+ pminsw m0, m1
+ movd eax, m0
+ ; repack with 3 bit index
+ xor eax, 0x80008000
+ movzx r3d, ax
+ shr eax, 15
+ add r3d, r3d
+ or eax, 1
+ cmp eax, r3d
+ cmovg eax, r3d
+ ; reverse to phminposuw order
+ mov r3d, eax
+ and eax, 7
+ shr r3d, 3
+ shl eax, 16
+ or eax, r3d
+%endif
+ add r2d, 8<<16
+ cmp ax, r2w
+ cmovg eax, r2d
+
+ mov r2d, eax
+ shr r2d, 16
+ shl r2d, 6
+ add r1, 4*FDEC_STRIDE
+ mova m0, [rsp+padbase+r2+0x00]
+ mova m1, [rsp+padbase+r2+0x10]
+ mova m2, [rsp+padbase+r2+0x20]
+ mova m3, [rsp+padbase+r2+0x30]
+ movq [r1+FDEC_STRIDE*-4], m0
+ movhps [r1+FDEC_STRIDE*-2], m0
+ movq [r1+FDEC_STRIDE*-3], m1
+ movhps [r1+FDEC_STRIDE*-1], m1
+ movq [r1+FDEC_STRIDE* 0], m2
+ movhps [r1+FDEC_STRIDE* 2], m2
+ movq [r1+FDEC_STRIDE* 1], m3
+ movhps [r1+FDEC_STRIDE* 3], m3
+ ADD rsp, pad
+ RET
+
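For reference, the flow of the x9 8x8 primitives is: build all nine predictions, cost each one against the source block (plain SAD in the routine above, a Hadamard-transformed cost in the sa8d variant that follows), add the per-mode bit costs (the totals are also written back through the satds pointer), pick the cheapest, and copy that mode's prediction into the reconstruction buffer. A plain-C sketch under assumed conventions (not x264's actual C fallback; strides are passed explicitly and the packed return value is illustrative):

    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>

    static int intra_x9_8x8_ref( const uint8_t *fenc, int fenc_stride,
                                 uint8_t *fdec, int fdec_stride,
                                 const uint8_t pred[9][8][8], const uint16_t *bitcosts )
    {
        int best = 1 << 30, best_mode = 0;
        for( int mode = 0; mode < 9; mode++ )
        {
            int cost = bitcosts[mode];
            for( int y = 0; y < 8; y++ )
                for( int x = 0; x < 8; x++ )
                    cost += abs( fenc[y*fenc_stride + x] - pred[mode][y][x] );
            if( cost < best )
            {
                best = cost;
                best_mode = mode;
            }
        }
        for( int y = 0; y < 8; y++ )        /* write back the winning prediction */
            memcpy( fdec + y*fdec_stride, pred[best_mode][y], 8 );
        return (best_mode << 16) + best;    /* the asm packs mode and cost along these lines */
    }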
+%ifdef ARCH_X86_64
+;-----------------------------------------------------------------------------
+; int intra_sa8d_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
+;-----------------------------------------------------------------------------
+cglobal intra_sa8d_x9_8x8, 5,6,16
+ %assign pad 0x2c0+0x10-gprsize-(stack_offset&15)
+ %define fenc_buf rsp
+ %define pred_buf rsp+0x80
+ SUB rsp, pad
+ mova m15, [hmul_8p]
+ pxor m8, m8
+%assign %%i 0
+%rep 8
+ movddup m %+ %%i, [r0+%%i*FENC_STRIDE]
+ pmaddubsw m9, m %+ %%i, m15
+ punpcklbw m %+ %%i, m8
+ mova [fenc_buf+%%i*0x10], m9
+%assign %%i %%i+1
+%endrep
+
+ ; save instruction size: avoid 4-byte memory offsets
+ lea r0, [intra8x9_h1+0x80]
+ %define off(m) (r0+m-(intra8x9_h1+0x80))
+ lea r5, [pred_buf+0x80]
+
+; v, h, dc
+ HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
+ pabsw m11, m1
+%assign %%i 2
+%rep 6
+ pabsw m8, m %+ %%i
+ paddw m11, m8
+%assign %%i %%i+1
+%endrep
+
+ ; 1D hadamard of edges
+ movq m8, [r2+7]
+ movddup m9, [r2+16]
+ mova [r5-0x80], m9
+ mova [r5-0x70], m9
+ mova [r5-0x60], m9
+ mova [r5-0x50], m9
+ punpcklwd m8, m8
+ pshufb m9, [intrax3_shuf]
+ pmaddubsw m8, [pb_pppm]
+ pmaddubsw m9, [pb_pppm]
+ HSUMSUB2 psignw, m8, m9, m12, m13, m9, q1032, [pw_ppppmmmm]
+ HSUMSUB2 psignw, m8, m9, m12, m13, m9, q2301, [pw_ppmmppmm]
+
+ ; dc
+ paddw m10, m8, m9
+ paddw m10, [pw_8]
+ pand m10, [sw_f0]
+ psrlw m12, m10, 4
+ psllw m10, 2
+ pxor m13, m13
+ pshufb m12, m13
+ mova [r5+0x00], m12
+ mova [r5+0x10], m12
+ mova [r5+0x20], m12
+ mova [r5+0x30], m12
+
+ ; differences
+ psllw m8, 3 ; left edge
+ psubw m8, m0
+ psubw m10, m0
+ pabsw m8, m8 ; 1x8 sum
+ pabsw m10, m10
+ paddw m8, m11
+ paddw m11, m10
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ punpcklwd m4, m5
+ punpcklwd m6, m7
+ punpckldq m0, m2
+ punpckldq m4, m6
+ punpcklqdq m0, m4 ; transpose
+ psllw m9, 3 ; top edge
+ psrldq m10, m11, 2 ; 8x7 sum
+ psubw m0, m9 ; 8x1 sum
+ pabsw m0, m0
+ paddw m10, m0
+
+ phaddd m10, m8 ; logically phaddw, but this is faster and it won't overflow
+ psrlw m11, 1
+ psrlw m10, 1
+
+; store h
+ movq m3, [r2+7]
+ pshufb m0, m3, [off(intra8x9_h1)]
+ pshufb m1, m3, [off(intra8x9_h2)]
+ pshufb m2, m3, [off(intra8x9_h3)]
+ pshufb m3, m3, [off(intra8x9_h4)]
+ mova [r5-0x40], m0
+ mova [r5-0x30], m1
+ mova [r5-0x20], m2
+ mova [r5-0x10], m3
+
+; ddl
+ mova m8, [r2+16]
+ movu m2, [r2+17]
+ pslldq m1, m8, 1
+ pavgb m9, m8, m2
+ PRED4x4_LOWPASS m8, m1, m2, m8, m3
+ pshufb m0, m8, [off(intra8x9_ddl1)]
+ pshufb m1, m8, [off(intra8x9_ddl2)]
+ pshufb m2, m8, [off(intra8x9_ddl3)]
+ pshufb m3, m8, [off(intra8x9_ddl4)]
+ add r5, 0x40
+ call .sa8d
+ phaddd m11, m0
+
+; vl
+ pshufb m0, m9, [off(intra8x9_vl1)]
+ pshufb m1, m8, [off(intra8x9_vl2)]
+ pshufb m2, m9, [off(intra8x9_vl3)]
+ pshufb m3, m8, [off(intra8x9_vl4)]
+ add r5, 0x100
+ call .sa8d
+ phaddd m10, m11
+ mova m12, m0
+
+; ddr
+ movu m2, [r2+8]
+ movu m8, [r2+7]
+ movu m1, [r2+6]
+ pavgb m9, m2, m8
+ PRED4x4_LOWPASS m8, m1, m2, m8, m3
+ pshufb m0, m8, [off(intra8x9_ddr1)]
+ pshufb m1, m8, [off(intra8x9_ddr2)]
+ pshufb m2, m8, [off(intra8x9_ddr3)]
+ pshufb m3, m8, [off(intra8x9_ddr4)]
+ sub r5, 0xc0
+ call .sa8d
+ mova m11, m0
+
+ add r0, 0x100
+ %define off(m) (r0+m-(intra8x9_h1+0x180))
+
+; vr
+ movsd m2, m9, m8
+ pshufb m0, m2, [off(intra8x9_vr1)]
+ pshufb m1, m8, [off(intra8x9_vr2)]
+ pshufb m2, m2, [off(intra8x9_vr3)]
+ pshufb m3, m8, [off(intra8x9_vr4)]
+ add r5, 0x40
+ call .sa8d
+ phaddd m11, m0
+
+; hd
+%if cpuflag(sse4)
+ pshufd m1, m9, q0001
+ pblendw m1, m8, q3330
+%else
+ pshufd m2, m9, q0001
+ movss m1, m8, m2
+%endif
+ punpcklbw m8, m9
+ pshufb m0, m1, [off(intra8x9_hd1)]
+ pshufb m1, m1, [off(intra8x9_hd2)]
+ pshufb m2, m8, [off(intra8x9_hd3)]
+ pshufb m3, m8, [off(intra8x9_hd4)]
+ add r5, 0x40
+ call .sa8d
+ phaddd m0, m12
+ phaddd m11, m0
+
+; hu
+%if cpuflag(sse4)
+ pinsrb m8, [r2+7], 15
+%else
+ movd m9, [r2+7]
+ pslldq m8, 1
+ palignr m9, m8, 1
+ SWAP 8, 9
+%endif
+ pshufb m0, m8, [off(intra8x9_hu1)]
+ pshufb m1, m8, [off(intra8x9_hu2)]
+ pshufb m2, m8, [off(intra8x9_hu3)]
+ pshufb m3, m8, [off(intra8x9_hu4)]
+ add r5, 0x80
+ call .sa8d
+
+ pmaddwd m0, [pw_1]
+ phaddw m10, m11
+ movhlps m1, m0
+ paddw m0, m1
+ pshuflw m1, m0, q0032
+ pavgw m0, m1
+ pxor m2, m2
+ pavgw m10, m2
+ movd r2d, m0
+
+ movu m0, [r3]
+ paddw m0, m10
+ mova [r4], m0
+ movzx r5d, word [r3+16]
+ add r2d, r5d
+ mov [r4+16], r2w
+
+%if cpuflag(sse4)
+ phminposuw m0, m0
+ movd eax, m0
+%else
+ ; 8x8 sa8d is up to 15 bits; +bitcosts and saturate -> 15 bits; pack with 1 bit index
+ paddusw m0, m0
+ paddw m0, [off(pw_s00001111)]
+ movhlps m1, m0
+ pminsw m0, m1
+ pshuflw m1, m0, q0032
+ mova m2, m0
+ pminsw m0, m1
+ pcmpgtw m2, m1 ; 2nd index bit
+ movd r3d, m0
+ movd r4d, m2
+ ; repack with 3 bit index
+ xor r3d, 0x80008000
+ and r4d, 0x00020002
+ movzx eax, r3w
+ movzx r5d, r4w
+ shr r3d, 16
+ shr r4d, 16
+ lea eax, [rax*4+r5]
+ lea r3d, [ r3*4+r4+1]
+ cmp eax, r3d
+ cmovg eax, r3d
+ ; reverse to phminposuw order
+ mov r3d, eax
+ and eax, 7
+ shr r3d, 3
+ shl eax, 16
+ or eax, r3d
+%endif
+ add r2d, 8<<16
+ cmp ax, r2w
+ cmovg eax, r2d
+
+ mov r2d, eax
+ shr r2d, 16
+ shl r2d, 6
+ add r1, 4*FDEC_STRIDE
+ mova m0, [pred_buf+r2+0x00]
+ mova m1, [pred_buf+r2+0x10]
+ mova m2, [pred_buf+r2+0x20]
+ mova m3, [pred_buf+r2+0x30]
+ movq [r1+FDEC_STRIDE*-4], m0
+ movhps [r1+FDEC_STRIDE*-2], m0
+ movq [r1+FDEC_STRIDE*-3], m1
+ movhps [r1+FDEC_STRIDE*-1], m1
+ movq [r1+FDEC_STRIDE* 0], m2
+ movhps [r1+FDEC_STRIDE* 2], m2
+ movq [r1+FDEC_STRIDE* 1], m3
+ movhps [r1+FDEC_STRIDE* 3], m3
+ ADD rsp, pad
+ RET
+
+ALIGN 16
+.sa8d:
+ %xdefine mret m0
+ %xdefine fenc_buf fenc_buf+gprsize
+ mova [r5+0x00], m0
+ mova [r5+0x10], m1
+ mova [r5+0x20], m2
+ mova [r5+0x30], m3
+ movddup m4, m0
+ movddup m5, m1
+ movddup m6, m2
+ movddup m7, m3
+ punpckhqdq m0, m0
+ punpckhqdq m1, m1
+ punpckhqdq m2, m2
+ punpckhqdq m3, m3
+ PERMUTE 0,4, 1,5, 2,0, 3,1, 4,6, 5,7, 6,2, 7,3
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ psubw m0, [fenc_buf+0x00]
+ psubw m1, [fenc_buf+0x10]
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ psubw m2, [fenc_buf+0x20]
+ psubw m3, [fenc_buf+0x30]
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ psubw m4, [fenc_buf+0x40]
+ psubw m5, [fenc_buf+0x50]
+ pmaddubsw m6, m15
+ pmaddubsw m7, m15
+ psubw m6, [fenc_buf+0x60]
+ psubw m7, [fenc_buf+0x70]
+ HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 13, 14
+ paddw m0, m1
+ paddw m0, m2
+ paddw mret, m0, m3
+ ret
+%endif ; ARCH_X86_64
+%endmacro ; INTRA8_X9
+
; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
; out: [tmp]=hadamard4, m0=satd
INIT_MMX mmx2
@@ -2911,12 +3736,12 @@ SA8D
HADAMARD_AC_SSE2
%ifndef HIGH_BIT_DEPTH
INTRA_X9
+INTRA8_X9
%endif
%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
%ifndef HIGH_BIT_DEPTH
-INTRA_SA8D_SSE2
INIT_MMX ssse3
INTRA_X3_MMX
%endif
@@ -2929,14 +3754,25 @@ SA8D
HADAMARD_AC_SSE2
%ifndef HIGH_BIT_DEPTH
INTRA_X9
+INTRA8_X9
%endif
INIT_XMM avx
SATDS_SSE2
SA8D
%ifndef HIGH_BIT_DEPTH
-INTRA_SA8D_SSE2
INTRA_X9
+INTRA8_X9
+%endif
+HADAMARD_AC_SSE2
+
+%define TRANS TRANS_XOP
+INIT_XMM xop
+SATDS_SSE2
+SA8D
+%ifndef HIGH_BIT_DEPTH
+INTRA_X9
+; no xop INTRA8_X9: it's slower than avx on bulldozer, though it's not clear why.
%endif
HADAMARD_AC_SSE2
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 9f77055..09a7217 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -62,16 +62,19 @@ DECL_X1( ssd, sse2slow )
DECL_X1( ssd, sse2 )
DECL_X1( ssd, ssse3 )
DECL_X1( ssd, avx )
+DECL_X1( ssd, xop )
DECL_X1( satd, mmx2 )
DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 )
DECL_X1( satd, sse4 )
DECL_X1( satd, avx )
+DECL_X1( satd, xop )
DECL_X1( sa8d, mmx2 )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
DECL_X1( sa8d, sse4 )
DECL_X1( sa8d, avx )
+DECL_X1( sa8d, xop )
DECL_X1( sad, cache32_mmx2 );
DECL_X1( sad, cache64_mmx2 );
DECL_X1( sad, cache64_sse2 );
@@ -84,18 +87,17 @@ DECL_X4( sad, cache64_ssse3 );
DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, int i_stride ))
void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * );
-void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_4x4_mmx2 ( pixel *, pixel *, int * );
-void x264_intra_sad_x3_4x4_sse4 ( uint8_t *, uint8_t *, int * );
-void x264_intra_sad_x3_4x4_avx ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
@@ -108,18 +110,21 @@ void x264_intra_sad_x3_16x16_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_16x16_ssse3 ( pixel *, pixel *, int * );
void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * );
-void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_avx ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
-void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
-void x264_intra_sad_x3_8x8_avx ( pixel *, pixel *, int * );
int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
+int x264_intra_satd_x9_4x4_xop ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_sad_x9_4x4_ssse3 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_sad_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_sad_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
+int x264_intra_sa8d_x9_8x8_ssse3( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
+int x264_intra_sa8d_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
+int x264_intra_sa8d_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
+int x264_intra_sad_x9_8x8_ssse3 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
+int x264_intra_sad_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
+int x264_intra_sad_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, int stride1,
pixel *pixuv2, int stride2, int width,
@@ -141,6 +146,11 @@ float x264_pixel_ssim_end4_avx( int sum0[5][4], int sum1[5][4], int width );
int x264_pixel_var2_8x8_mmx2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
+int x264_pixel_var2_8x8_xop( uint8_t *, int, uint8_t *, int, int * );
+int x264_pixel_var2_8x16_mmx2( pixel *, int, pixel *, int, int * );
+int x264_pixel_var2_8x16_sse2( pixel *, int, pixel *, int, int * );
+int x264_pixel_var2_8x16_ssse3( uint8_t *, int, uint8_t *, int, int * );
+int x264_pixel_var2_8x16_xop( uint8_t *, int, uint8_t *, int, int * );
int x264_pixel_vsad_mmx2( pixel *src, int stride, int height );
int x264_pixel_vsad_sse2( pixel *src, int stride, int height );
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index c9c502c..2486f35 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -582,9 +582,9 @@ PREDICT_4x4_V1 b
;-----------------------------------------------------------------------------
; void predict_4x4_dc( pixel *src )
;-----------------------------------------------------------------------------
+INIT_MMX mmx2
%ifdef HIGH_BIT_DEPTH
-INIT_MMX
-cglobal predict_4x4_dc_mmx2, 1,1
+cglobal predict_4x4_dc, 1,1
mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
@@ -603,8 +603,7 @@ cglobal predict_4x4_dc_mmx2, 1,1
RET
%else ; !HIGH_BIT_DEPTH
-INIT_MMX
-cglobal predict_4x4_dc_mmx2, 1,4
+cglobal predict_4x4_dc, 1,4
pxor mm7, mm7
movd mm0, [r0-FDEC_STRIDEB]
psadbw mm0, mm7
@@ -669,6 +668,7 @@ cglobal predict_8x8_filter, 4,6,6
add t4d, r5d
shr t4d, 2
mov [t1+7*SIZEOF_PIXEL], t4%1
+ mov [t1+6*SIZEOF_PIXEL], t4%1
test r3b, 2
je .done
.check_top:
@@ -797,8 +797,8 @@ PREDICT_8x8_H bw, W
; void predict_8x8_dc( pixel *src, pixel *edge );
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-cglobal predict_8x8_dc_sse2, 2,2
+INIT_XMM sse2
+cglobal predict_8x8_dc, 2,2
movu m0, [r1+14]
paddw m0, [r1+32]
HADDW m0, m1
@@ -809,8 +809,8 @@ cglobal predict_8x8_dc_sse2, 2,2
REP_RET
%else ; !HIGH_BIT_DEPTH
-INIT_MMX
-cglobal predict_8x8_dc_mmx2, 2,2
+INIT_MMX mmx2
+cglobal predict_8x8_dc, 2,2
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [r1+7]
@@ -839,9 +839,9 @@ cglobal %1, 2,2
STORE8x8 m0, m0
RET
%endmacro
-INIT_XMM
-PREDICT_8x8_DC predict_8x8_dc_top_sse2 , 32, mova
-PREDICT_8x8_DC predict_8x8_dc_left_sse2, 14, movu
+INIT_XMM sse2
+PREDICT_8x8_DC predict_8x8_dc_top , 32, mova
+PREDICT_8x8_DC predict_8x8_dc_left, 14, movu
%else ; !HIGH_BIT_DEPTH
%macro PREDICT_8x8_DC 2
@@ -1106,9 +1106,9 @@ ALIGN 4
REP_RET
%endif ; !ARCH_X86_64
-INIT_XMM
+%macro PREDICT_8x8C 0
%ifdef HIGH_BIT_DEPTH
-cglobal predict_8x8c_p_core_sse2, 1,1,7
+cglobal predict_8x8c_p_core, 1,1,7
movd m0, r1m
movd m2, r2m
movd m4, r3m
@@ -1133,7 +1133,7 @@ cglobal predict_8x8c_p_core_sse2, 1,1,7
jg .loop
REP_RET
%else ; !HIGH_BIT_DEPTH
-cglobal predict_8x8c_p_core_sse2, 1,1
+cglobal predict_8x8c_p_core, 1,1
movd m0, r1m
movd m2, r2m
movd m4, r3m
@@ -1163,12 +1163,19 @@ call .loop
movhps [r0+FDEC_STRIDE*3], m5
RET
%endif ; HIGH_BIT_DEPTH
+%endmacro
+
+INIT_XMM sse2
+PREDICT_8x8C
+INIT_XMM avx
+PREDICT_8x8C
;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
%ifndef ARCH_X86_64
-cglobal predict_16x16_p_core_mmx2, 1,2
+INIT_MMX mmx2
+cglobal predict_16x16_p_core, 1,2
LOAD_PLANE_ARGS
movq mm5, mm2
movq mm1, mm2
@@ -1668,6 +1675,16 @@ PREDICT_C_H 16
; void predict_8x8c_dc( pixel *src )
;-----------------------------------------------------------------------------
+%macro LOAD_LEFT 1
+ movzx r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL]
+ movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL]
+ add r1d, r2d
+ movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL]
+ add r1d, r2d
+ movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL]
+ add r1d, r2d
+%endmacro
+
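LOAD_LEFT just accumulates four left-neighbour pixels into r1d; the DC predictors below call it once per group of four rows. A scalar model (8-bit case, stride in the sense of FDEC_STRIDEB):

    #include <stdint.h>

    /* Sum of the four left-column neighbours of rows row..row+3. */
    static int load_left4( const uint8_t *fdec, int stride, int row )
    {
        return fdec[(row + 0)*stride - 1] + fdec[(row + 1)*stride - 1]
             + fdec[(row + 2)*stride - 1] + fdec[(row + 3)*stride - 1];
    }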
%macro PREDICT_8x8C_DC 0
cglobal predict_8x8c_dc, 1,3
pxor m7, m7
@@ -1684,23 +1701,10 @@ cglobal predict_8x8c_dc, 1,3
%endif
add r0, FDEC_STRIDEB*4
- movzx r1d, pixel [r0-FDEC_STRIDEB*4-SIZEOF_PIXEL]
- movzx r2d, pixel [r0-FDEC_STRIDEB*3-SIZEOF_PIXEL]
- add r1d, r2d
- movzx r2d, pixel [r0-FDEC_STRIDEB*2-SIZEOF_PIXEL]
- add r1d, r2d
- movzx r2d, pixel [r0-FDEC_STRIDEB*1-SIZEOF_PIXEL]
- add r1d, r2d
- movd m2, r1d ; s2
-
- movzx r1d, pixel [r0+FDEC_STRIDEB*0-SIZEOF_PIXEL]
- movzx r2d, pixel [r0+FDEC_STRIDEB*1-SIZEOF_PIXEL]
- add r1d, r2d
- movzx r2d, pixel [r0+FDEC_STRIDEB*2-SIZEOF_PIXEL]
- add r1d, r2d
- movzx r2d, pixel [r0+FDEC_STRIDEB*3-SIZEOF_PIXEL]
- add r1d, r2d
- movd m3, r1d ; s3
+ LOAD_LEFT 0 ; s2
+ movd m2, r1d
+ LOAD_LEFT 4 ; s3
+ movd m3, r1d
punpcklwd m0, m1
punpcklwd m2, m3
@@ -1759,6 +1763,124 @@ INIT_MMX sse2
PREDICT_8x8C_DC
%endif
+%ifdef HIGH_BIT_DEPTH
+%macro STORE_4LINES 3
+%if cpuflag(sse2)
+ movdqa [r0+FDEC_STRIDEB*(%3-4)], %1
+ movdqa [r0+FDEC_STRIDEB*(%3-3)], %1
+ movdqa [r0+FDEC_STRIDEB*(%3-2)], %1
+ movdqa [r0+FDEC_STRIDEB*(%3-1)], %1
+%else
+ movq [r0+FDEC_STRIDEB*(%3-4)+0], %1
+ movq [r0+FDEC_STRIDEB*(%3-4)+8], %2
+ movq [r0+FDEC_STRIDEB*(%3-3)+0], %1
+ movq [r0+FDEC_STRIDEB*(%3-3)+8], %2
+ movq [r0+FDEC_STRIDEB*(%3-2)+0], %1
+ movq [r0+FDEC_STRIDEB*(%3-2)+8], %2
+ movq [r0+FDEC_STRIDEB*(%3-1)+0], %1
+ movq [r0+FDEC_STRIDEB*(%3-1)+8], %2
+%endif
+%endmacro
+%else
+%macro STORE_4LINES 2
+ movq [r0+FDEC_STRIDEB*(%2-4)], %1
+ movq [r0+FDEC_STRIDEB*(%2-3)], %1
+ movq [r0+FDEC_STRIDEB*(%2-2)], %1
+ movq [r0+FDEC_STRIDEB*(%2-1)], %1
+%endmacro
+%endif
+
+%macro PREDICT_8x16C_DC 0
+cglobal predict_8x16c_dc, 1,3
+ pxor m7, m7
+%ifdef HIGH_BIT_DEPTH
+ movq m0, [r0-FDEC_STRIDEB+0]
+ movq m1, [r0-FDEC_STRIDEB+8]
+ HADDW m0, m2
+ HADDW m1, m2
+%else
+ movd m0, [r0-FDEC_STRIDEB+0]
+ movd m1, [r0-FDEC_STRIDEB+4]
+ psadbw m0, m7 ; s0
+ psadbw m1, m7 ; s1
+%endif
+ punpcklwd m0, m1 ; s0, s1
+
+ add r0, FDEC_STRIDEB*4
+ LOAD_LEFT 0 ; s2
+ pinsrw m0, r1d, 2
+ LOAD_LEFT 4 ; s3
+ pinsrw m0, r1d, 3 ; s0, s1, s2, s3
+ add r0, FDEC_STRIDEB*8
+ LOAD_LEFT 0 ; s4
+ pinsrw m1, r1d, 2
+ LOAD_LEFT 4 ; s5
+ pinsrw m1, r1d, 3 ; s1, __, s4, s5
+ sub r0, FDEC_STRIDEB*8
+
+ pshufw m2, m0, q1310 ; s0, s1, s3, s1
+ pshufw m0, m0, q3312 ; s2, s1, s3, s3
+ pshufw m3, m1, q0302 ; s4, s1, s5, s1
+ pshufw m1, m1, q3322 ; s4, s4, s5, s5
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 2
+ psrlw m1, 2
+ pavgw m0, m7
+ pavgw m1, m7
+%ifdef HIGH_BIT_DEPTH
+%if cpuflag(sse2)
+ movq2dq xmm0, m0
+ movq2dq xmm1, m1
+ punpcklwd xmm0, xmm0
+ punpcklwd xmm1, xmm1
+ pshufd xmm2, xmm0, q3322
+ pshufd xmm3, xmm1, q3322
+ punpckldq xmm0, xmm0
+ punpckldq xmm1, xmm1
+ STORE_4LINES xmm0, xmm0, 0
+ STORE_4LINES xmm2, xmm2, 4
+ STORE_4LINES xmm1, xmm1, 8
+ STORE_4LINES xmm3, xmm3, 12
+%else
+ pshufw m2, m0, q0000
+ pshufw m3, m0, q1111
+ pshufw m4, m0, q2222
+ pshufw m5, m0, q3333
+ STORE_4LINES m2, m3, 0
+ STORE_4LINES m4, m5, 4
+ pshufw m2, m1, q0000
+ pshufw m3, m1, q1111
+ pshufw m4, m1, q2222
+ pshufw m5, m1, q3333
+ STORE_4LINES m2, m3, 8
+ STORE_4LINES m4, m5, 12
+%endif
+%else
+ packuswb m0, m0 ; dc0, dc1, dc2, dc3
+ packuswb m1, m1 ; dc4, dc5, dc6, dc7
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ pshufw m2, m0, q1100
+ pshufw m3, m0, q3322
+ pshufw m4, m1, q1100
+ pshufw m5, m1, q3322
+ STORE_4LINES m2, 0
+ STORE_4LINES m3, 4
+ add r0, FDEC_STRIDEB*8
+ STORE_4LINES m4, 0
+ STORE_4LINES m5, 4
+%endif
+ RET
+%endmacro
+
+INIT_MMX mmx2
+PREDICT_8x16C_DC
+%ifdef HIGH_BIT_DEPTH
+INIT_MMX sse2
+PREDICT_8x16C_DC
+%endif
+
%macro PREDICT_C_DC_TOP 1
%ifdef HIGH_BIT_DEPTH
INIT_XMM
@@ -1912,8 +2034,8 @@ PREDICT_16x16_H
%endif
%endmacro
-INIT_MMX
-cglobal predict_16x16_dc_core_mmx2, 1,2
+INIT_MMX mmx2
+cglobal predict_16x16_dc_core, 1,2
%ifdef ARCH_X86_64
movd m6, r1d
PRED16x16_DC m6, 5
@@ -1922,20 +2044,20 @@ cglobal predict_16x16_dc_core_mmx2, 1,2
%endif
REP_RET
-INIT_MMX
-cglobal predict_16x16_dc_top_mmx2, 1,2
+INIT_MMX mmx2
+cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC [pw_8], 4
REP_RET
-INIT_MMX
+INIT_MMX mmx2
%ifdef HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core_mmx2, 1,2
+cglobal predict_16x16_dc_left_core, 1,2
movd m0, r1m
SPLATW m0, m0
STORE16x16 m0, m0, m0, m0
REP_RET
%else ; !HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core_mmx2, 1,1
+cglobal predict_16x16_dc_left_core, 1,1
movd m0, r1m
pshufw m0, m0, 0
packuswb m0, m0
@@ -1969,25 +2091,25 @@ cglobal predict_16x16_dc_left_core_mmx2, 1,1
%endif
%endmacro
-INIT_XMM
-cglobal predict_16x16_dc_core_sse2, 2,2,4
+INIT_XMM sse2
+cglobal predict_16x16_dc_core, 2,2,4
movd m3, r1m
PRED16x16_DC_SSE2 m3, 5
REP_RET
-cglobal predict_16x16_dc_top_sse2, 1,2
+cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC_SSE2 [pw_8], 4
REP_RET
-INIT_XMM
+INIT_XMM sse2
%ifdef HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core_sse2, 1,2
+cglobal predict_16x16_dc_left_core, 1,2
movd m0, r1m
SPLATW m0, m0
STORE16x16_SSE2 m0, m0
REP_RET
%else ; !HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core_sse2, 1,1
+cglobal predict_16x16_dc_left_core, 1,1
movd m0, r1m
SPLATW m0, m0
packuswb m0, m0
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index a5d7a36..ab5f0d6 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -191,55 +191,65 @@ PREDICT_8x8_P( sse2 )
#endif //!HIGH_BIT_DEPTH
#if HAVE_X86_INLINE_ASM
+
+#define PREDICT_8x8C_P_CORE\
+ V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )\
+ + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )\
+ + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )\
+ + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );\
+ H += -4 * src[-1*FDEC_STRIDE -1];\
+ int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
+ int b = ( 17 * H + 16 ) >> 5;\
+ int c = ( 17 * V + 16 ) >> 5;
+
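The H/V/a/b/c terms above feed the standard H.264 plane prediction; the _p_core routines then evaluate it across the 8x8 block. A scalar model for the 8-bit case (a sketch, with the stride passed explicitly where the real code uses FDEC_STRIDE):

    #include <stdint.h>

    /* pred(x,y) = clip( (i00 + b*x + c*y) >> 5 ), where i00 = a - 3*b - 3*c + 16. */
    static void predict_8x8c_p_ref( uint8_t *src, int stride, int i00, int b, int c )
    {
        for( int y = 0; y < 8; y++ )
            for( int x = 0; x < 8; x++ )
            {
                int v = (i00 + b*x + c*y) >> 5;
                src[y*stride + x] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
            }
    }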
#if HIGH_BIT_DEPTH
-static void x264_predict_8x8c_p_sse2( uint16_t *src )
-#else
-static void x264_predict_8x8c_p_ssse3( uint8_t *src )
-#endif
-{
- int a, b, c, i00;
- int H, V;
-#if HIGH_BIT_DEPTH
- asm (
- "movdqa %1, %%xmm0 \n"
- "pmaddwd %2, %%xmm0 \n"
- "movhlps %%xmm0, %%xmm1 \n"
- "paddd %%xmm1, %%xmm0 \n"
- "pshuflw $14, %%xmm0, %%xmm1 \n"
- "paddd %%xmm1, %%xmm0 \n"
- "movd %%xmm0, %0 \n"
- :"=r"(H)
- :"m"(src[-FDEC_STRIDE]), "m"(*pw_m32101234)
- );
-#else
- asm (
- "movq %1, %%mm0 \n"
- "pmaddubsw %2, %%mm0 \n"
- "pshufw $14, %%mm0, %%mm1 \n"
- "paddw %%mm1, %%mm0 \n"
- "pshufw $1, %%mm0, %%mm1 \n"
- "paddw %%mm1, %%mm0 \n"
- "movd %%mm0, %0 \n"
- "movswl %w0, %0 \n"
- :"=r"(H)
- :"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234)
- );
-#endif
- V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )
- + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )
- + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )
- + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );
- H += -4 * src[-1*FDEC_STRIDE -1];
- a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );
- b = ( 17 * H + 16 ) >> 5;
- c = ( 17 * V + 16 ) >> 5;
- i00 = a -3*b -3*c + 16;
-#if HIGH_BIT_DEPTH
- x264_predict_8x8c_p_core_sse2( src, a, b, c );
-#else
- x264_predict_8x8c_p_core_sse2( src, i00, b, c );
-#endif
+#define PREDICT_8x8_P2(cpu1, cpu2)\
+static void x264_predict_8x8c_p_ ## cpu1( pixel *src )\
+{\
+ int H, V;\
+ asm (\
+ "movdqa %1, %%xmm0 \n"\
+ "pmaddwd %2, %%xmm0 \n"\
+ "movhlps %%xmm0, %%xmm1 \n"\
+ "paddd %%xmm1, %%xmm0 \n"\
+ "pshuflw $14, %%xmm0, %%xmm1 \n"\
+ "paddd %%xmm1, %%xmm0 \n"\
+ "movd %%xmm0, %0 \n"\
+ :"=r"(H)\
+ :"m"(src[-FDEC_STRIDE]), "m"(*pw_m32101234)\
+ );\
+ PREDICT_8x8C_P_CORE\
+ x264_predict_8x8c_p_core_ ## cpu2( src, a, b, c );\
+}
+
+PREDICT_8x8_P2(sse2, sse2)
+PREDICT_8x8_P2( avx, avx)
+
+#else //!HIGH_BIT_DEPTH
+#define PREDICT_8x8_P2(cpu1, cpu2)\
+static void x264_predict_8x8c_p_ ## cpu1( pixel *src )\
+{\
+ int H, V;\
+ asm (\
+ "movq %1, %%mm0 \n"\
+ "pmaddubsw %2, %%mm0 \n"\
+ "pshufw $14, %%mm0, %%mm1 \n"\
+ "paddw %%mm1, %%mm0 \n"\
+ "pshufw $1, %%mm0, %%mm1 \n"\
+ "paddw %%mm1, %%mm0 \n"\
+ "movd %%mm0, %0 \n"\
+ "movswl %w0, %0 \n"\
+ :"=r"(H)\
+ :"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234)\
+ );\
+ PREDICT_8x8C_P_CORE\
+ int i00 = a -3*b -3*c + 16;\
+ x264_predict_8x8c_p_core_ ## cpu2( src, i00, b, c );\
}
+
+PREDICT_8x8_P2(ssse3, sse2)
+PREDICT_8x8_P2( avx, avx)
+#endif
#endif
#if ARCH_X86_64 && !HIGH_BIT_DEPTH
@@ -336,6 +346,9 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_sse2;
#if HAVE_X86_INLINE_ASM
pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_sse2;
+ if( !(cpu&X264_CPU_AVX) )
+ return;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_avx;
#endif
#else
#if ARCH_X86_64
@@ -358,6 +371,9 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_ssse3;
#if HAVE_X86_INLINE_ASM
pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_ssse3;
+ if( !(cpu&X264_CPU_AVX) )
+ return;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_avx;
#endif
#endif // HIGH_BIT_DEPTH
}
@@ -367,16 +383,21 @@ void x264_predict_8x16c_init_mmx( int cpu, x264_predict_t pf[7] )
if( !(cpu&X264_CPU_MMX) )
return;
#if HIGH_BIT_DEPTH
- pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse2;
+ if( !(cpu&X264_CPU_MMX2) )
+ return;
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2;
if( !(cpu&X264_CPU_SSE2) )
return;
+ pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse2;
pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_sse2;
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_sse2;
pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_sse2;
#else
pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_mmx;
if( !(cpu&X264_CPU_MMX2) )
return;
pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_mmx2;
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2;
pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2;
if( !(cpu&X264_CPU_SSSE3) )
return;
diff --git a/common/x86/predict.h b/common/x86/predict.h
index 4bf639d..63e08de 100644
--- a/common/x86/predict.h
+++ b/common/x86/predict.h
@@ -50,6 +50,8 @@ void x264_predict_16x16_dc_top_ssse3( uint16_t *src );
void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
+void x264_predict_8x16c_dc_mmx2( pixel *src );
+void x264_predict_8x16c_dc_sse2( uint16_t *src );
void x264_predict_8x16c_dc_top_mmx2( uint8_t *src );
void x264_predict_8x16c_dc_top_sse2( uint16_t *src );
void x264_predict_8x16c_v_mmx( uint8_t *src );
@@ -59,6 +61,7 @@ void x264_predict_8x16c_h_sse2( pixel *src );
void x264_predict_8x16c_h_ssse3( uint8_t *src );
void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
+void x264_predict_8x8c_p_core_avx( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_dc_mmx2( pixel *src );
void x264_predict_8x8c_dc_sse2( uint16_t *src );
void x264_predict_8x8c_dc_top_mmx2( uint8_t *src );
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 177e2d6..20b5d12 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -428,16 +428,13 @@ QUANT_AC quant_8x8, 8
;;; m4 0
mova m0, %1
%ifdef HIGH_BIT_DEPTH
- pmaddwd m0, %2
- paddd m0, m3
+ pmadcswd m0, m0, %2, m3
psrad m0, m2
%else
punpckhwd m1, m0, m4
punpcklwd m0, m4
- pmaddwd m0, %2
- pmaddwd m1, %3
- paddd m0, m3
- paddd m1, m3
+ pmadcswd m0, m0, %2, m3
+ pmadcswd m1, m1, %3, m3
psrad m0, m2
psrad m1, m2
packssdw m0, m1
@@ -574,6 +571,9 @@ cglobal dequant_%1x%1_flat16, 0,3
INIT_XMM sse2
DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
+INIT_XMM xop
+DEQUANT 4, 4, 1
+DEQUANT 8, 6, 1
%else
%ifndef ARCH_X86_64
INIT_MMX mmx
@@ -586,6 +586,9 @@ DEQUANT 8, 6, 2
INIT_XMM avx
DEQUANT 4, 4, 2
DEQUANT 8, 6, 2
+INIT_XMM xop
+DEQUANT 4, 4, 2
+DEQUANT 8, 6, 2
%endif
%macro DEQUANT_DC 2
@@ -622,8 +625,7 @@ cglobal dequant_4x4dc, 0,3,6
pshufd m2, m2, 0
%rep SIZEOF_PIXEL*32/mmsize
mova m0, [r0+x]
- pmaddwd m0, m2
- paddd m0, m4
+ pmadcswd m0, m0, m2, m4
psrad m0, m3
mova [r0+x], m0
%assign x x+mmsize
@@ -651,6 +653,8 @@ cglobal dequant_4x4dc, 0,3,6
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
DEQUANT_DC d, pmaddwd
+INIT_XMM xop
+DEQUANT_DC d, pmaddwd
%else
%ifndef ARCH_X86_64
INIT_MMX mmx2
@@ -1153,12 +1157,25 @@ DECIMATE8x8
pmovmskb %2, mm0
%elif mmsize == 16
movdqa xmm0, [%3+ 0]
+%if %1 == 8
+ packssdw xmm0, [%3+16]
+ packsswb xmm0, xmm0
+%else
movdqa xmm1, [%3+32]
packssdw xmm0, [%3+16]
packssdw xmm1, [%3+48]
packsswb xmm0, xmm1
+%endif
pcmpeqb xmm0, xmm2
pmovmskb %2, xmm0
+%elif %1 == 8
+ movq mm0, [%3+ 0]
+ movq mm1, [%3+16]
+ packssdw mm0, [%3+ 8]
+ packssdw mm1, [%3+24]
+ packsswb mm0, mm1
+ pcmpeqb mm0, mm2
+ pmovmskb %2, mm0
%else
movq mm0, [%3+ 0]
movq mm1, [%3+16]
@@ -1194,11 +1211,38 @@ COEFF_LAST4
INIT_MMX mmx2, lzcnt
COEFF_LAST4
+%macro COEFF_LAST8 0
+cglobal coeff_last8, 1,3
+ pxor m2, m2
+ LAST_MASK 8, r1d, r0
+%if mmsize == 16
+ xor r1d, 0xffff
+ shr r1d, 8
+%else
+ xor r1d, 0xff
+%endif
+ BSR eax, r1d, 0x1f
+ RET
+%endmacro
+
+%ifndef ARCH_X86_64
+INIT_MMX mmx2
+COEFF_LAST8
+%endif
+INIT_XMM sse2
+COEFF_LAST8
+INIT_XMM sse2, lzcnt
+COEFF_LAST8
+
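coeff_last8 follows the same pattern as the existing coeff_last4: build a byte mask of zero coefficients, invert it, and bit-scan for the highest set bit. A scalar sketch of the intended result (dctcoef is 32-bit in the high-bit-depth branch this hunk lives in; behaviour for an all-zero block is not modelled here):

    #include <stdint.h>

    /* Index of the last nonzero coefficient among 8. */
    static int coeff_last8_ref( const int32_t *dct )
    {
        int last = -1;
        for( int i = 0; i < 8; i++ )
            if( dct[i] )
                last = i;
        return last;
    }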
%else ; !HIGH_BIT_DEPTH
%macro LAST_MASK 3-4
+%if %1 <= 8
+ movq mm0, [%3+ 0]
%if %1 == 4
- movq mm0, [%3]
packsswb mm0, mm0
+%else
+ packsswb mm0, [%3+ 8]
+%endif
pcmpeqb mm0, mm2
pmovmskb %2, mm0
%elif mmsize == 16
@@ -1220,7 +1264,7 @@ COEFF_LAST4
%endif
%endmacro
-%macro COEFF_LAST4 0
+%macro COEFF_LAST48 0
%ifdef ARCH_X86_64
cglobal coeff_last4, 1,1
BSR rax, [r0], 0x3f
@@ -1239,12 +1283,19 @@ cglobal coeff_last4, 0,3
lea eax, [eax+ecx*2]
RET
%endif
+
+cglobal coeff_last8, 1,3
+ pxor m2, m2
+ LAST_MASK 8, r1d, r0, r2d
+ xor r1d, 0xff
+ BSR eax, r1d, 0x1f
+ RET
%endmacro
INIT_MMX mmx2
-COEFF_LAST4
+COEFF_LAST48
INIT_MMX mmx2, lzcnt
-COEFF_LAST4
+COEFF_LAST48
%endif ; HIGH_BIT_DEPTH
%macro COEFF_LAST 0
@@ -1364,11 +1415,19 @@ COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
%endif
COEFF_LEVELRUN 4
+COEFF_LEVELRUN 8
INIT_XMM sse2
+%ifdef HIGH_BIT_DEPTH
+COEFF_LEVELRUN 8
+%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_XMM sse2, lzcnt
+%ifdef HIGH_BIT_DEPTH
+COEFF_LEVELRUN 8
+%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_MMX mmx2, lzcnt
COEFF_LEVELRUN 4
+COEFF_LEVELRUN 8
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 77d2146..2b31401 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -53,6 +53,9 @@ void x264_dequant_8x8_sse2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_avx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4dc_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
@@ -81,13 +84,17 @@ int x264_decimate_score64_mmx2( dctcoef *dct );
int x264_decimate_score64_sse2( dctcoef *dct );
int x264_decimate_score64_ssse3( dctcoef *dct );
int x264_coeff_last4_mmx2( dctcoef *dct );
+int x264_coeff_last8_mmx2( dctcoef *dct );
int x264_coeff_last15_mmx2( dctcoef *dct );
int x264_coeff_last16_mmx2( dctcoef *dct );
int x264_coeff_last64_mmx2( dctcoef *dct );
+int x264_coeff_last8_sse2( dctcoef *dct );
int x264_coeff_last15_sse2( dctcoef *dct );
int x264_coeff_last16_sse2( dctcoef *dct );
int x264_coeff_last64_sse2( dctcoef *dct );
int x264_coeff_last4_mmx2_lzcnt( dctcoef *dct );
+int x264_coeff_last8_mmx2_lzcnt( dctcoef *dct );
+int x264_coeff_last8_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last15_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last16_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last64_sse2_lzcnt( dctcoef *dct );
@@ -99,5 +106,9 @@ int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
#endif
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index df79b92..0fad5b6 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -29,19 +29,6 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
-
-h4x4_pred_shuf: db 3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15
-h4x4_pred_shuf2: db 3,7,11,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
-h8x8_pred_shuf: times 8 db 1
- times 8 db 0
- times 8 db 3
- times 8 db 2
- times 8 db 5
- times 8 db 4
- times 8 db 7
- times 8 db 6
-
SECTION .text
cextern pb_3
@@ -385,45 +372,6 @@ cglobal intra_sad_x3_4x4_mmx2, 3,3
movd [r2+4], mm1 ;H prediction cost
RET
-%macro INTRA_SADx3_4x4 0
-cglobal intra_sad_x3_4x4, 3,3
- movd xmm4, [r1+FDEC_STRIDE*0-4]
- pinsrd xmm4, [r1+FDEC_STRIDE*1-4], 1
- pinsrd xmm4, [r1+FDEC_STRIDE*2-4], 2
- pinsrd xmm4, [r1+FDEC_STRIDE*3-4], 3
- movd xmm2, [r1-FDEC_STRIDE]
- pxor xmm3, xmm3
- pshufb xmm5, xmm4, [h4x4_pred_shuf] ; EEEEFFFFGGGGHHHH
- pshufb xmm4, [h4x4_pred_shuf2] ; EFGH
- pshufd xmm0, xmm2, 0 ; ABCDABCDABCDABCD
- punpckldq xmm2, xmm4 ; ABCDEFGH
- psadbw xmm2, xmm3
- movd xmm1, [r0+FENC_STRIDE*0]
- pinsrd xmm1, [r0+FENC_STRIDE*1], 1
- pinsrd xmm1, [r0+FENC_STRIDE*2], 2
- pinsrd xmm1, [r0+FENC_STRIDE*3], 3
- psadbw xmm0, xmm1
- psadbw xmm5, xmm1
- psraw xmm2, 2
- pavgw xmm2, xmm3
- pshufb xmm2, xmm3 ; DC prediction
- punpckhqdq xmm3, xmm0, xmm5
- punpcklqdq xmm0, xmm5
- psadbw xmm2, xmm1
- paddw xmm0, xmm3
- movhlps xmm4, xmm2
- packusdw xmm0, xmm0
- paddw xmm2, xmm4
- movq [r2], xmm0 ; V/H prediction costs
- movd [r2+8], xmm2 ; DC prediction cost
- RET
-%endmacro ; INTRA_SADx3_4x4
-
-INIT_XMM sse4
-INTRA_SADx3_4x4
-INIT_XMM avx
-INTRA_SADx3_4x4
-
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3]);
;-----------------------------------------------------------------------------
@@ -491,69 +439,6 @@ cglobal intra_sad_x3_8x8_mmx2, 3,3
movd [r2+8], m1
RET
-%macro INTRA_SADx3_8x8 0
-cglobal intra_sad_x3_8x8, 3,4,9
-%ifdef PIC
- lea r11, [h8x8_pred_shuf]
-%define shuf r11
-%else
-%define shuf h8x8_pred_shuf
-%endif
- movq m0, [r1+7] ; left pixels
- movq m1, [r1+16] ; top pixels
- pxor m2, m2
- pxor m3, m3
- psadbw m2, m0
- psadbw m3, m1
- paddw m2, m3
- pxor m3, m3 ; V score accumulator
- psraw m2, 3
- pavgw m2, m3
- punpcklqdq m1, m1 ; V prediction
- pshufb m2, m3 ; DC prediction
- pxor m4, m4 ; H score accumulator
- pxor m5, m5 ; DC score accumulator
- mov r3d, 6
-.loop:
- movq m6, [r0+FENC_STRIDE*0]
- movhps m6, [r0+FENC_STRIDE*1]
- pshufb m7, m0, [shuf+r3*8] ; H prediction
-%ifdef ARCH_X86_64
- psadbw m7, m6
- psadbw m8, m1, m6
- psadbw m6, m2
- paddw m4, m7
- paddw m3, m8
- paddw m5, m6
-%else
- psadbw m7, m6
- paddw m4, m7
- psadbw m7, m1, m6
- psadbw m6, m2
- paddw m3, m7
- paddw m5, m6
-%endif
- add r0, FENC_STRIDE*2
- sub r3d, 2
- jge .loop
-
- movhlps m0, m3
- movhlps m1, m4
- movhlps m2, m5
- paddw m3, m0
- paddw m4, m1
- paddw m5, m2
- movd [r2+0], m3
- movd [r2+4], m4
- movd [r2+8], m5
- RET
-%endmacro ; INTRA_SADx3_8x8
-
-INIT_XMM ssse3
-INTRA_SADx3_8x8
-INIT_XMM avx
-INTRA_SADx3_8x8
-
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 1cfb6fa..1bb64a9 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -531,6 +531,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%assign cpuflags_sse4 (1<<7) | cpuflags_ssse3
%assign cpuflags_sse42 (1<<8) | cpuflags_sse4
%assign cpuflags_avx (1<<9) | cpuflags_sse42
+%assign cpuflags_xop (1<<10)| cpuflags_avx
+%assign cpuflags_fma4 (1<<11)| cpuflags_avx
%assign cpuflags_cache32 (1<<16)
%assign cpuflags_cache64 (1<<17)
@@ -871,6 +873,8 @@ AVX_INSTR minpd, 1, 0
AVX_INSTR minps, 1, 0
AVX_INSTR minsd, 1, 0
AVX_INSTR minss, 1, 0
+AVX_INSTR movsd, 1, 0
+AVX_INSTR movss, 1, 0
AVX_INSTR mpsadbw, 0, 1
AVX_INSTR mulpd, 1, 0
AVX_INSTR mulps, 1, 0
@@ -986,7 +990,6 @@ AVX_INSTR pfadd, 1, 0
AVX_INSTR pfsub, 1, 0
AVX_INSTR pfmul, 1, 0
-
; base-4 constants for shuffles
%assign i 0
%rep 256
@@ -1004,3 +1007,18 @@ AVX_INSTR pfmul, 1, 0
%endrep
%undef i
%undef j
+
+%macro FMA_INSTR 3
+ %macro %1 4-7 %1, %2, %3
+ %if cpuflag(xop)
+ v%5 %1, %2, %3, %4
+ %else
+ %6 %1, %2, %3
+ %7 %1, %4
+ %endif
+ %endmacro
+%endmacro
+
+FMA_INSTR pmacsdd, pmulld, paddd
+FMA_INSTR pmacsww, pmullw, paddw
+FMA_INSTR pmadcswd, pmaddwd, paddd
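FMA_INSTR hides an XOP multiply-accumulate and its two-instruction fallback behind one name: with cpuflag(xop), pmadcswd m0, m0, m2, m3 assembles to the fused vpmadcswd, otherwise to pmaddwd m0, m2 followed by paddd m0, m3, which is how the quant-a.asm hunks above use it. A scalar model of one 32-bit lane, assuming the operand ranges never reach the saturation the XOP form applies:

    #include <stdint.h>

    /* One dword lane: multiply adjacent signed words, sum the pair, accumulate. */
    static int32_t pmadcswd_lane( int16_t a0, int16_t a1, int16_t b0, int16_t b1, int32_t acc )
    {
        return (int32_t)a0 * b0 + (int32_t)a1 * b1 + acc;
    }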
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index 07581d2..c3ab742 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -273,16 +273,28 @@
%endmacro
%macro HADDW 2
+%if cpuflag(xop) && mmsize == 16
+ vphaddwq %1, %1
+ movhlps %2, %1
+ paddd %1, %2
+%else
pmaddwd %1, [pw_1]
HADDD %1, %2
+%endif
%endmacro
%macro HADDUW 2
+%if cpuflag(xop) && mmsize == 16
+ vphadduwq %1, %1
+ movhlps %2, %1
+ paddd %1, %2
+%else
psrld %2, %1, 16
pslld %1, 16
psrld %1, 16
paddd %1, %2
HADDD %1, %2
+%endif
%endmacro
%macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp
@@ -457,6 +469,17 @@
%endif
%endmacro
+%macro TRANS_XOP 5-6
+%ifidn %1, d
+ vpperm m%5, m%3, m%4, [transd_shuf1]
+ vpperm m%3, m%3, m%4, [transd_shuf2]
+%elifidn %1, q
+ shufps m%5, m%3, m%4, q3131
+ shufps m%3, m%4, q2020
+%endif
+ SWAP %4, %5
+%endmacro
+
%macro HADAMARD 5-6
; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes)
; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes)
@@ -546,6 +569,22 @@
%endif
%endmacro
+; doesn't include the "pmaddubsw hmul_8p" pass
+%macro HADAMARD8_2D_HMUL 10
+ HADAMARD4_V %1, %2, %3, %4, %9
+ HADAMARD4_V %5, %6, %7, %8, %9
+ SUMSUB_BADC w, %1, %5, %2, %6, %9
+ HADAMARD 2, sumsub, %1, %5, %9, %10
+ HADAMARD 2, sumsub, %2, %6, %9, %10
+ SUMSUB_BADC w, %3, %7, %4, %8, %9
+ HADAMARD 2, sumsub, %3, %7, %9, %10
+ HADAMARD 2, sumsub, %4, %8, %9, %10
+ HADAMARD 1, amax, %1, %5, %9, %10
+ HADAMARD 1, amax, %2, %6, %9, %5
+ HADAMARD 1, amax, %3, %7, %9, %5
+ HADAMARD 1, amax, %4, %8, %9, %5
+%endmacro
+
%macro SUMSUB2_AB 4
%ifnum %3
psub%1 m%4, m%2, m%3
@@ -736,3 +775,11 @@
packuswb %1, %1
movh %4, %1
%endmacro
+
+%macro SHUFFLE_MASK_W 8
+ %rep 8
+ db %1*2
+ db %1*2+1
+ %rotate 1
+ %endrep
+%endmacro
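SHUFFLE_MASK_W is a data generator for pshufb masks that operate on 16-bit words: each of its eight arguments is a word index, expanded into the byte pair that selects that word. In C terms it emits, roughly:

    #include <stdint.h>

    /* Sketch of the 16-byte table one SHUFFLE_MASK_W invocation produces. */
    static void shuffle_mask_w( uint8_t mask[16], const uint8_t word_idx[8] )
    {
        for( int i = 0; i < 8; i++ )
        {
            mask[2*i]   = word_idx[i]*2;     /* low byte of the selected word  */
            mask[2*i+1] = word_idx[i]*2 + 1; /* high byte of the selected word */
        }
    }
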
diff --git a/configure b/configure
index 3aadd18..f338b5f 100755
--- a/configure
+++ b/configure
@@ -30,6 +30,7 @@ Configuration options:
--disable-interlaced disable interlaced encoding support
--enable-visualize enable visualization (X11 only)
--bit-depth=BIT_DEPTH set output bit depth (8-10) [8]
+ --chroma-format=FORMAT output chroma format (420, 422, 444, all) [all]
Advanced options:
--disable-asm disable platform-specific assembly optimizations
@@ -237,6 +238,7 @@ strip="no"
pic="no"
vis="no"
bit_depth="8"
+chroma_format="all"
compiler="GNU"
CFLAGS="$CFLAGS -Wall -I."
@@ -357,6 +359,13 @@ for opt do
fi
bit_depth=`expr $bit_depth + 0`
;;
+ --chroma-format=*)
+ chroma_format="$optarg"
+ if [ $chroma_format != "420" -a $chroma_format != "422" -a $chroma_format != "444" -a $chroma_format != "all" ]; then
+ echo "Supplied chroma format must be 420, 422, 444 or all."
+ exit 1
+ fi
+ ;;
*)
echo "Unknown option $opt, ignored"
;;
@@ -465,6 +474,11 @@ case $host_os in
SYS="SunOS"
define HAVE_MALLOC_H
LDFLAGS="$LDFLAGS -lm"
+ if cc_check "" /usr/lib/64/values-xpg6.o; then
+ LDFLAGS="$LDFLAGS /usr/lib/64/values-xpg6.o"
+ else
+ LDFLAGS="$LDFLAGS /usr/lib/values-xpg6.o"
+ fi
HAVE_GETOPT_LONG=0
;;
*)
@@ -574,7 +588,7 @@ case $host_cpu in
s390|s390x)
ARCH="S390"
;;
- parisc|parisc64)
+ hppa*|parisc*)
ARCH="PARISC"
;;
ia64)
@@ -612,15 +626,15 @@ if [ $compiler != ICL ]; then
fi
fi
-if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" \) ] ; then
+if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" \) ] ; then
pic="yes"
fi
if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
- if ! as_check "vpaddw xmm0, xmm0, xmm0" ; then
+ if ! as_check "vpperm xmm0, xmm0, xmm0, xmm0" ; then
VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1`
echo "Found $VER"
- echo "Minimum version is yasm-0.7.0"
+ echo "Minimum version is yasm-1.0.0"
echo "If you really want to compile without asm, configure with --disable-asm."
exit 1
fi
@@ -861,7 +875,8 @@ if [ "$pic" = "yes" ] ; then
CFLAGS="$CFLAGS -fPIC"
ASFLAGS="$ASFLAGS -DPIC"
# resolve textrels in the x86 asm
- cc_check stdio.h -Wl,-Bsymbolic && LDFLAGS="$LDFLAGS -Wl,-Bsymbolic"
+ cc_check stdio.h "-shared -Wl,-Bsymbolic" && SOFLAGS="$SOFLAGS -Wl,-Bsymbolic"
+ [ $SYS = SunOS -a "$ARCH" = "X86" ] && SOFLAGS="$SOFLAGS -mimpure-text"
fi
if [ "$debug" != "yes" -a "$gprof" != "yes" ]; then
@@ -912,6 +927,10 @@ if [ "$bit_depth" -gt "8" ]; then
ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH"
fi
+if [ "$chroma_format" != "all" ]; then
+ define CHROMA_FORMAT CHROMA_$chroma_format
+fi
+
ASFLAGS="$ASFLAGS -DBIT_DEPTH=$bit_depth"
[ $gpl = yes ] && define HAVE_GPL && x264_gpl=1 || x264_gpl=0
@@ -966,10 +985,13 @@ rm -f conftest*
# generate exported config file
+config_chroma_format="X264_CSP_I$chroma_format"
+[ "$config_chroma_format" == "X264_CSP_Iall" ] && config_chroma_format="0"
cat > x264_config.h << EOF
-#define X264_BIT_DEPTH $bit_depth
-#define X264_GPL $x264_gpl
-#define X264_INTERLACED $x264_interlaced
+#define X264_BIT_DEPTH $bit_depth
+#define X264_GPL $x264_gpl
+#define X264_INTERLACED $x264_interlaced
+#define X264_CHROMA_FORMAT $config_chroma_format
EOF
# generate config files
@@ -1021,25 +1043,25 @@ if [ "$shared" = "yes" ]; then
echo 'IMPLIBNAME=libx264.dll.lib' >> config.mak
# GNU ld on windows defaults to exporting all global functions if there are no explicit __declspec(dllexport) declarations
# MSVC link does not act similarly, so it is required to make an export definition out of x264.h and use it at link time
- echo 'SOFLAGS=-dll -def:x264.def -implib:$(IMPLIBNAME)' >> config.mak
+ echo "SOFLAGS=-dll -def:x264.def -implib:\$(IMPLIBNAME) $SOFLAGS" >> config.mak
echo "EXPORTS" > x264.def
grep "^\(int\|void\|x264_t\|extern\).*x264.*[\[(;]" x264.h | sed -e "s/.*\(x264.*\)[\[(].*/\1/;s/.*\(x264.*\);/\1/;s/open/open_$API/g" >> x264.def
else
echo 'IMPLIBNAME=libx264.dll.a' >> config.mak
- echo 'SOFLAGS=-shared -Wl,--out-implib,$(IMPLIBNAME) -Wl,--enable-auto-image-base' >> config.mak
+ echo "SOFLAGS=-shared -Wl,--out-implib,\$(IMPLIBNAME) -Wl,--enable-auto-image-base $SOFLAGS" >> config.mak
fi
elif [ "$SYS" = "MACOSX" ]; then
echo "SOSUFFIX=dylib" >> config.mak
echo "SONAME=libx264.$API.dylib" >> config.mak
- echo 'SOFLAGS=-shared -dynamiclib -Wl,-single_module -Wl,-read_only_relocs,suppress -install_name $(DESTDIR)$(libdir)/$(SONAME)' >> config.mak
+ echo "SOFLAGS=-shared -dynamiclib -Wl,-single_module -Wl,-read_only_relocs,suppress -install_name \$(DESTDIR)\$(libdir)/\$(SONAME) $SOFLAGS" >> config.mak
elif [ "$SYS" = "SunOS" ]; then
echo "SOSUFFIX=so" >> config.mak
echo "SONAME=libx264.so.$API" >> config.mak
- echo 'SOFLAGS=-shared -Wl,-h,$(SONAME)' >> config.mak
+ echo "SOFLAGS=-shared -Wl,-h,\$(SONAME) $SOFLAGS" >> config.mak
else
echo "SOSUFFIX=so" >> config.mak
echo "SONAME=libx264.so.$API" >> config.mak
- echo 'SOFLAGS=-shared -Wl,-soname,$(SONAME)' >> config.mak
+ echo "SOFLAGS=-shared -Wl,-soname,\$(SONAME) $SOFLAGS" >> config.mak
fi
echo 'default: lib-shared' >> config.mak
echo 'install: install-lib-shared' >> config.mak
@@ -1088,27 +1110,28 @@ gpl_filters=""
[ $gpl = yes ] && filters="$filters $gpl_filters"
cat > conftest.log <<EOF
-Platform: $ARCH
-System: $SYS
-cli: $cli
-libx264: $cli_libx264
-shared: $shared
-static: $static
-asm: $asm
-interlaced: $interlaced
-avs: $avs
-lavf: $lavf
-ffms: $ffms
-gpac: $gpac
-gpl: $gpl
-thread: $thread
-filters: $filters
-debug: $debug
-gprof: $gprof
-strip: $strip
-PIC: $pic
-visualize: $vis
-bit depth: $bit_depth
+platform: $ARCH
+system: $SYS
+cli: $cli
+libx264: $cli_libx264
+shared: $shared
+static: $static
+asm: $asm
+interlaced: $interlaced
+avs: $avs
+lavf: $lavf
+ffms: $ffms
+gpac: $gpac
+gpl: $gpl
+thread: $thread
+filters: $filters
+debug: $debug
+gprof: $gprof
+strip: $strip
+PIC: $pic
+visualize: $vis
+bit depth: $bit_depth
+chroma format: $chroma_format
EOF
echo >> config.log
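The new --chroma-format option surfaces in three places: a CHROMA_FORMAT define for the internal build (so chroma-specific code can be compiled out), an X264_CHROMA_FORMAT constant exported via x264_config.h (0 when every format is built in), and runtime CSP validation added to encoder.c further down. A minimal sketch of how an application could consult the exported constant, assuming only the header generated above:

    #include <stdio.h>
    #include "x264_config.h"

    int main( void )
    {
    #if X264_CHROMA_FORMAT
        /* built for exactly one chroma format, e.g. X264_CSP_I420 for --chroma-format=420 */
        printf( "single chroma-format build: csp %d\n", X264_CHROMA_FORMAT );
    #else
        printf( "all chroma formats supported\n" );
    #endif
        return 0;
    }
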
diff --git a/encoder/analyse.c b/encoder/analyse.c
index a514b57..83d8b5d 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -94,7 +94,7 @@ typedef struct
int i_satd_i8x8;
int i_cbp_i8x8_luma;
- int i_satd_i8x8_dir[12][4];
+ ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] );
int i_predict8x8[4];
int i_satd_i4x4;
@@ -844,6 +844,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
if( a->i_satd_i16x16 > i16x16_thresh )
return;
+ uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8;
/* 8x8 prediction selection */
if( flags & X264_ANALYSE_I8x8 )
{
@@ -870,53 +871,69 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
- if( !h->mb.b_lossless && predict_mode[5] >= 0 )
+ if( h->pixf.intra_mbcmp_x9_8x8 && predict_mode[8] >= 0 )
{
- int satd[9];
- h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
- int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
- satd[i_pred_mode] -= 3 * lambda;
- for( int i = 2; i >= 0; i-- )
+ /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
+ i_best = h->pixf.intra_mbcmp_x9_8x8( p_src_by, p_dst_by, edge, cost_i4x4_mode-i_pred_mode, a->i_satd_i8x8_dir[idx] );
+ i_cost += i_best & 0xffff;
+ i_best >>= 16;
+ a->i_predict8x8[idx] = i_best;
+ if( idx == 3 || i_cost > i_satd_thresh )
+ break;
+ x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, i_best );
+ }
+ else
+ {
+ if( !h->mb.b_lossless && predict_mode[5] >= 0 )
{
- int cost = satd[i];
- a->i_satd_i8x8_dir[i][idx] = cost + 4 * lambda;
- COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
+ int satd[9];
+ h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
+ int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
+ satd[i_pred_mode] -= 3 * lambda;
+ for( int i = 2; i >= 0; i-- )
+ {
+ int cost = satd[i];
+ a->i_satd_i8x8_dir[idx][i] = cost + 4 * lambda;
+ COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
+ }
+
+ /* Take analysis shortcuts: don't analyse modes that are too
+ * far away direction-wise from the favored mode. */
+ if( a->i_mbrd < 1 + a->b_fast_intra )
+ predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
+ else
+ predict_mode += 3;
}
- /* Take analysis shortcuts: don't analyse modes that are too
- * far away direction-wise from the favored mode. */
- if( a->i_mbrd < 1 + a->b_fast_intra )
- predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
- else
- predict_mode += 3;
- }
+ for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
+ {
+ int i_satd;
+ int i_mode = *predict_mode;
- for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
- {
- int i_satd;
- int i_mode = *predict_mode;
+ if( h->mb.b_lossless )
+ x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge );
+ else
+ h->predict_8x8[i_mode]( p_dst_by, edge );
- if( h->mb.b_lossless )
- x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge );
- else
- h->predict_8x8[i_mode]( p_dst_by, edge );
+ i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
+ if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
+ i_satd -= 3 * lambda;
- i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
- if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
- i_satd -= 3 * lambda;
+ COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
+ a->i_satd_i8x8_dir[idx][i_mode] = i_satd + 4 * lambda;
+ }
+ i_cost += i_best + 3*lambda;
- COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
- a->i_satd_i8x8_dir[i_mode][idx] = i_satd + 4 * lambda;
+ if( idx == 3 || i_cost > i_satd_thresh )
+ break;
+ if( h->mb.b_lossless )
+ x264_predict_lossless_8x8( h, p_dst_by, 0, idx, a->i_predict8x8[idx], edge );
+ else
+ h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
+ x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
}
- i_cost += i_best + 3 * lambda;
-
- if( idx == 3 || i_cost > i_satd_thresh )
- break;
-
/* we need to encode this block now (for next ones) */
- x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge );
-
- x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
+ x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 );
}
if( idx == 3 )
@@ -951,7 +968,6 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
{
int i_cost = lambda * (24+16); /* 24from JVT (SATD0), 16 from base predmode costs */
int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX;
- uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8;
h->mb.i_cbp_luma = 0;
if( a->b_early_terminate && a->i_mbrd )
@@ -977,8 +993,12 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
{
/* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode );
- a->i_predict4x4[idx] = i_best >> 16;
- i_best &= 0xffff;
+ i_cost += i_best & 0xffff;
+ i_best >>= 16;
+ a->i_predict4x4[idx] = i_best;
+ if( i_cost > i_satd_thresh || idx == 15 )
+ break;
+ h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = i_best;
}
else
{
@@ -1027,17 +1047,18 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
}
}
- i_best += 3 * lambda;
- }
- i_cost += i_best;
-
- if( i_cost > i_satd_thresh || idx == 15 )
- break;
+ i_cost += i_best + 3 * lambda;
+ if( i_cost > i_satd_thresh || idx == 15 )
+ break;
+ if( h->mb.b_lossless )
+ x264_predict_lossless_4x4( h, p_dst_by, 0, idx, a->i_predict4x4[idx] );
+ else
+ h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
+ h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
+ }
/* we need to encode this block now (for next ones) */
- x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx] );
-
- h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
+ x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx], 0 );
}
if( idx == 15 )
{
@@ -1228,7 +1249,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE,
h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE};
int cbp_luma_new = 0;
- int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8 : COST_MAX;
+ int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[idx][a->i_predict8x8[idx]] * 11/8 : COST_MAX;
i_best = COST_MAX64;
@@ -1239,7 +1260,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
for( ; *predict_mode >= 0; predict_mode++ )
{
int i_mode = *predict_mode;
- if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
+ if( a->i_satd_i8x8_dir[idx][i_mode] > i_thresh )
continue;
h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
@@ -1287,8 +1308,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
(m)->i_stride[1] = h->mb.pic.i_stride[1]; \
(m)->i_stride[2] = h->mb.pic.i_stride[2]; \
(m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
- (m)->p_fenc[1] = &(src)[1][((xoff)>>h->mb.chroma_h_shift)+((yoff)>>h->mb.chroma_v_shift)*FENC_STRIDE]; \
- (m)->p_fenc[2] = &(src)[2][((xoff)>>h->mb.chroma_h_shift)+((yoff)>>h->mb.chroma_v_shift)*FENC_STRIDE]; \
+ (m)->p_fenc[1] = &(src)[1][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
+ (m)->p_fenc[2] = &(src)[2][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
}
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
@@ -1309,7 +1330,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
(m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
} \
else \
- (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>h->mb.chroma_v_shift)*(m)->i_stride[1]]; \
+ (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>CHROMA_V_SHIFT)*(m)->i_stride[1]]; \
(m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
(m)->weight = x264_weight_none; \
(m)->i_ref = ref; \
@@ -1887,7 +1908,7 @@ static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *
} \
else \
{ \
- int v_shift = h->mb.chroma_v_shift; \
+ int v_shift = CHROMA_V_SHIFT; \
int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \
@@ -1934,8 +1955,8 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
&p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
if( h->mb.b_chroma_me )
{
- int fenc_offset = (x>>h->mb.chroma_h_shift) + (y>>h->mb.chroma_v_shift)*FENC_STRIDE;
- int fdec_offset = (x>>h->mb.chroma_h_shift) + (y>>h->mb.chroma_v_shift)*FDEC_STRIDE;
+ int fenc_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FENC_STRIDE;
+ int fdec_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FDEC_STRIDE;
a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE,
&h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE )
+ h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE,
@@ -2097,7 +2118,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] );
int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
- int v_shift = h->mb.chroma_v_shift;
+ int v_shift = CHROMA_V_SHIFT;
if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref )
{
@@ -3251,7 +3272,7 @@ intra_analysis:
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
if( !CHROMA444 )
{
- int height = 16 >> h->mb.chroma_v_shift;
+ int height = 16 >> CHROMA_V_SHIFT;
h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );
h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );
}
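Two related changes run through the analyse.c hunks above: i_satd_i8x8_dir is transposed to [block][mode] (and aligned) so a whole per-block row can be handed to the new intra_mbcmp_x9_8x8 primitive, and that primitive returns its result packed, cost in the low 16 bits and winning mode in the high 16, the same convention the 4x4 x9 path already used. A sketch of the unpacking the callers perform:

    /* Unpack the (mode << 16) | cost value returned by the x9 intra helpers. */
    static inline void unpack_x9_result( int packed, int *cost, int *mode )
    {
        *cost = packed & 0xffff; /* SATD/SA8D of the winning mode, mode-bit bias included */
        *mode = packed >> 16;    /* index of the winning prediction mode */
    }

In the code above, the cost is added to i_cost and the mode is stored in i_predict8x8[idx] (or i_predict4x4[idx]) and cached for neighbouring blocks before moving on to the next partition.
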
diff --git a/encoder/cabac.c b/encoder/cabac.c
index eb2bfe8..f3080fe 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -952,7 +952,7 @@ static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_
bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] );
if( chroma )
for( int ch = 1; ch < 3; ch++ )
- for( int i = 0; i < 16>>h->mb.chroma_v_shift; i++ )
+ for( int i = 0; i < 16>>CHROMA_V_SHIFT; i++ )
for( int j = 0; j < 8; j++ )
bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
@@ -1076,7 +1076,7 @@ if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy]
if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */
{
- int step = 8 << h->mb.chroma_v_shift;
+ int step = 8 << CHROMA_V_SHIFT;
for( int i = 16; i < 3*16; i += step )
for( int j = i; j < i+4; j++ )
x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, b_intra );
@@ -1231,7 +1231,7 @@ static void x264_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
if( h->mb.i_cbp_chroma == 2 )
{
- int step = 8 << h->mb.chroma_v_shift;
+ int step = 8 << CHROMA_V_SHIFT;
for( int i = 16; i < 3*16; i += step )
for( int j = i; j < i+4; j++ )
x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, 1 );
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index 9763abe..26af61f 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -178,7 +178,7 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct
if( ctx_block_cat == DCT_CHROMA_DC )
{
- if( i_total < 8>>h->mb.chroma_v_shift )
+ if( i_total < 8>>CHROMA_V_SHIFT )
{
vlc_t total_zeros = CHROMA_FORMAT == CHROMA_420 ? x264_total_zeros_2x2_dc[i_total-1][i_total_zero]
: x264_total_zeros_2x4_dc[i_total-1][i_total_zero];
@@ -202,7 +202,7 @@ static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
#define x264_cavlc_block_residual(h,cat,idx,l)\
{\
- int nC = cat == DCT_CHROMA_DC ? 3 + CHROMA_FORMAT\
+ int nC = cat == DCT_CHROMA_DC ? 5 - CHROMA_V_SHIFT\
: ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];\
uint8_t *nnz = &h->mb.cache.non_zero_count[x264_scan8[idx]];\
if( !*nnz )\
@@ -505,7 +505,7 @@ void x264_macroblock_write_cavlc( x264_t *h )
bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] );
if( chroma )
for( int ch = 1; ch < 3; ch++ )
- for( int i = 0; i < 16>>h->mb.chroma_v_shift; i++ )
+ for( int i = 0; i < 16>>CHROMA_V_SHIFT; i++ )
for( int j = 0; j < 8; j++ )
bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
@@ -564,7 +564,7 @@ void x264_macroblock_write_cavlc( x264_t *h )
x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] );
if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */
{
- int step = 8 << h->mb.chroma_v_shift;
+ int step = 8 << CHROMA_V_SHIFT;
for( int i = 16; i < 3*16; i += step )
for( int j = i; j < i+4; j++ )
x264_cavlc_block_residual( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 );
@@ -691,7 +691,7 @@ static int x264_chroma_size_cavlc( x264_t *h )
if( h->mb.i_cbp_chroma == 2 )
{
- int step = 8 << h->mb.chroma_v_shift;
+ int step = 8 << CHROMA_V_SHIFT;
for( int i = 16; i < 3*16; i += step )
for( int j = i; j < i+4; j++ )
x264_cavlc_block_residual( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 );
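The nC change for the chroma DC block is a re-expression rather than a behaviour change: with the chroma format now potentially fixed at configure time, 3 + CHROMA_FORMAT becomes 5 - CHROMA_V_SHIFT, which selects the same ct_index entry for both subsampled formats. A worked check, assuming x264's usual constants (CHROMA_420 == 1, CHROMA_422 == 2, vertical chroma shift 1 for 4:2:0 and 0 for 4:2:2):

    enum { CHROMA_420 = 1, CHROMA_422 = 2 };  /* assumed values from common.h */

    static int nc_equivalence( void )
    {
        int nc420_old = 3 + CHROMA_420, nc420_new = 5 - 1; /* both 4: 2x2 chroma DC */
        int nc422_old = 3 + CHROMA_422, nc422_new = 5 - 0; /* both 5: 2x4 chroma DC */
        return nc420_old == nc420_new && nc422_old == nc422_new; /* always 1 */
    }
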
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 0cae34a..72d716f 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -79,7 +79,7 @@ static void x264_frame_dump( x264_t *h )
if( !CHROMA444 )
{
int cw = h->param.i_width>>1;
- int ch = h->param.i_height>>h->mb.chroma_v_shift;
+ int ch = h->param.i_height>>CHROMA_V_SHIFT;
pixel *planeu = x264_malloc( (cw*ch*2+32)*sizeof(pixel) );
pixel *planev = planeu + cw*ch + 16;
h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch );
@@ -418,6 +418,23 @@ static int x264_validate_parameters( x264_t *h, int b_open )
}
int i_csp = h->param.i_csp & X264_CSP_MASK;
+#if X264_CHROMA_FORMAT
+ if( CHROMA_FORMAT != CHROMA_420 && i_csp >= X264_CSP_I420 && i_csp <= X264_CSP_NV12 )
+ {
+ x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:0 support\n" );
+ return -1;
+ }
+ else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp <= X264_CSP_NV16 )
+ {
+ x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:2 support\n" );
+ return -1;
+ }
+ else if( CHROMA_FORMAT != CHROMA_444 && i_csp >= X264_CSP_I444 && i_csp <= X264_CSP_RGB )
+ {
+ x264_log( h, X264_LOG_ERROR, "not compiled with 4:4:4 support\n" );
+ return -1;
+ }
+#endif
if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX )
{
x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" );
@@ -947,6 +964,8 @@ static void mbcmp_init( x264_t *h )
h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
h->pixf.intra_mbcmp_x9_4x4 = h->param.b_cpu_independent || h->mb.b_lossless ? NULL
: satd ? h->pixf.intra_satd_x9_4x4 : h->pixf.intra_sad_x9_4x4;
+ h->pixf.intra_mbcmp_x9_8x8 = h->param.b_cpu_independent || h->mb.b_lossless ? NULL
+ : satd ? h->pixf.intra_sa8d_x9_8x8 : h->pixf.intra_sad_x9_8x8;
satd &= h->param.analyse.i_me_method == X264_ME_TESA;
memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
@@ -961,6 +980,7 @@ static void chroma_dsp_init( x264_t *h )
{
case CHROMA_420:
memcpy( h->predict_chroma, h->predict_8x8c, sizeof(h->predict_chroma) );
+ h->mc.prefetch_fenc = h->mc.prefetch_fenc_420;
h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_420;
h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_420_intra;
h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_420_mbaff;
@@ -971,6 +991,7 @@ static void chroma_dsp_init( x264_t *h )
break;
case CHROMA_422:
memcpy( h->predict_chroma, h->predict_8x16c, sizeof(h->predict_chroma) );
+ h->mc.prefetch_fenc = h->mc.prefetch_fenc_422;
h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_422;
h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_422_intra;
h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_422_mbaff;
@@ -980,6 +1001,7 @@ static void chroma_dsp_init( x264_t *h )
h->quantf.coeff_level_run[DCT_CHROMA_DC] = h->quantf.coeff_level_run8;
break;
case CHROMA_444:
+ h->mc.prefetch_fenc = h->mc.prefetch_fenc_422; /* FIXME: doesn't cover V plane */
h->loopf.deblock_chroma_mbaff = h->loopf.deblock_luma_mbaff;
h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_luma_intra_mbaff;
break;
@@ -1832,7 +1854,7 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
* consistency by copying deblocked pixels between planes. */
if( PARAM_INTERLACED )
for( int p = 0; p < h->fdec->i_plane; p++ )
- for( int i = minpix_y>>(h->mb.chroma_v_shift && p); i < maxpix_y>>(h->mb.chroma_v_shift && p); i++ )
+ for( int i = minpix_y>>(CHROMA_V_SHIFT && p); i < maxpix_y>>(CHROMA_V_SHIFT && p); i++ )
memcpy( h->fdec->plane_fld[p] + i*h->fdec->i_stride[p],
h->fdec->plane[p] + i*h->fdec->i_stride[p],
h->mb.i_mb_width*16*sizeof(pixel) );
@@ -1871,7 +1893,7 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
if( !CHROMA444 )
{
uint64_t ssd_u, ssd_v;
- int v_shift = h->mb.chroma_v_shift;
+ int v_shift = CHROMA_V_SHIFT;
x264_pixel_ssd_nv12( &h->pixf,
h->fdec->plane[1] + (minpix_y>>v_shift) * h->fdec->i_stride[1], h->fdec->i_stride[1],
h->fenc->plane[1] + (minpix_y>>v_shift) * h->fenc->i_stride[1], h->fenc->i_stride[1],
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 8251349..5bbc3ce 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -112,28 +112,6 @@ static ALWAYS_INLINE int array_non_zero( dctcoef *v, int i_count )
return 0;
}
-static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
-{
- int i_quant_cat = b_intra ? (p?CQM_4IC:CQM_4IY) : (p?CQM_4PC:CQM_4PY);
- if( h->mb.b_noise_reduction )
- h->quantf.denoise_dct( dct, h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
- if( h->mb.b_trellis )
- return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*16 );
- else
- return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
-}
-
-static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
-{
- int i_quant_cat = b_intra ? (p?CQM_8IC:CQM_8IY) : (p?CQM_8PC:CQM_8PY);
- if( h->mb.b_noise_reduction )
- h->quantf.denoise_dct( dct, h->nr_residual_sum[1+!!p*2], h->nr_offset[1+!!p*2], 64 );
- if( h->mb.b_trellis )
- return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*4 );
- else
- return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
-}
-
/* All encoding functions must output the correct CBP and NNZ values.
* The entropy coding functions will check CBP first, then NNZ, before
* actually reading the DCT coefficients. NNZ still must be correct even
@@ -145,99 +123,6 @@ static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, i
/* This means that decimation can be done merely by adjusting the CBP and NNZ
* rather than memsetting the coefficients. */
-void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode )
-{
- int nz;
- pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
- pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
- ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
-
- if( h->mb.b_lossless )
- x264_predict_lossless_4x4( h, p_dst, p, idx, i_mode );
- else
- h->predict_4x4[i_mode]( p_dst );
-
- if( h->mb.b_lossless )
- {
- nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+idx], p_src, p_dst );
- h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
- h->mb.i_cbp_luma |= nz<<(idx>>2);
- return;
- }
-
- h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
-
- nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 1, p, idx );
- h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
- if( nz )
- {
- h->mb.i_cbp_luma |= 1<<(idx>>2);
- h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4 );
- h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[p?CQM_4IC:CQM_4IY], i_qp );
- h->dctf.add4x4_idct( p_dst, dct4x4 );
- }
-}
-
-#define STORE_8x8_NNZ( p, idx, nz )\
-do\
-{\
- M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+0] ) = (nz) * 0x0101;\
- M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+8] ) = (nz) * 0x0101;\
-} while(0)
-
-#define CLEAR_16x16_NNZ( p ) \
-do\
-{\
- M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 0]] ) = 0;\
- M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 2]] ) = 0;\
- M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 8]] ) = 0;\
- M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+10]] ) = 0;\
-} while(0)
-
-void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge )
-{
- int x = idx&1;
- int y = idx>>1;
- int nz;
- pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
- pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
- ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
- ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
-
- if( !edge )
- {
- h->predict_8x8_filter( p_dst, edge_buf, h->mb.i_neighbour8[idx], x264_pred_i4x4_neighbors[i_mode] );
- edge = edge_buf;
- }
-
- if( h->mb.b_lossless )
- x264_predict_lossless_8x8( h, p_dst, p, idx, i_mode, edge );
- else
- h->predict_8x8[i_mode]( p_dst, edge );
-
- if( h->mb.b_lossless )
- {
- nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+idx], p_src, p_dst );
- STORE_8x8_NNZ( p, idx, nz );
- h->mb.i_cbp_luma |= nz<<idx;
- return;
- }
-
- h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
-
- nz = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 1, p, idx );
- if( nz )
- {
- h->mb.i_cbp_luma |= 1<<idx;
- h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8 );
- h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[p?CQM_8IC:CQM_8IY], i_qp );
- h->dctf.add8x8_idct8( p_dst, dct8x8 );
- STORE_8x8_NNZ( p, idx, 1 );
- }
- else
- STORE_8x8_NNZ( p, idx, 0 );
-}
-
static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
{
pixel *p_src = h->mb.pic.p_fenc[p];
@@ -602,7 +487,7 @@ static void x264_macroblock_encode_skip( x264_t *h )
void x264_predict_lossless_chroma( x264_t *h, int i_mode )
{
- int height = 16 >> h->mb.chroma_v_shift;
+ int height = 16 >> CHROMA_V_SHIFT;
if( i_mode == I_PRED_CHROMA_V )
{
h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, height );
@@ -686,7 +571,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 );
if( chroma )
{
- int height = 16 >> h->mb.chroma_v_shift;
+ int height = 16 >> CHROMA_V_SHIFT;
h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height );
h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height );
}
@@ -722,7 +607,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
if( chroma )
{
- int v_shift = h->mb.chroma_v_shift;
+ int v_shift = CHROMA_V_SHIFT;
int height = 16 >> v_shift;
/* Special case for mv0, which is (of course) very common in P-skip mode. */
@@ -788,7 +673,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0 ; i < 4; i++ )
{
int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
- x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL );
+ x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 );
}
i_qp = h->mb.i_chroma_qp;
}
@@ -820,7 +705,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
/* emulate missing topright samples */
MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] );
- x264_mb_encode_i4x4( h, p, i, i_qp, i_mode );
+ x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 );
}
i_qp = h->mb.i_chroma_qp;
}
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index d8ca95d..f8c2149 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -52,8 +52,6 @@ void x264_macroblock_write_cavlc ( x264_t *h );
void x264_macroblock_encode_p8x8( x264_t *h, int i8 );
void x264_macroblock_encode_p4x4( x264_t *h, int i4 );
-void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode );
-void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge );
void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp );
void x264_cabac_mb_skip( x264_t *h, int b_skip );
@@ -68,5 +66,126 @@ int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
void x264_noise_reduction_update( x264_t *h );
+static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
+{
+ int i_quant_cat = b_intra ? (p?CQM_4IC:CQM_4IY) : (p?CQM_4PC:CQM_4PY);
+ if( h->mb.b_noise_reduction )
+ h->quantf.denoise_dct( dct, h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
+ if( h->mb.b_trellis )
+ return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*16 );
+ else
+ return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
+}
+
+static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
+{
+ int i_quant_cat = b_intra ? (p?CQM_8IC:CQM_8IY) : (p?CQM_8PC:CQM_8PY);
+ if( h->mb.b_noise_reduction )
+ h->quantf.denoise_dct( dct, h->nr_residual_sum[1+!!p*2], h->nr_offset[1+!!p*2], 64 );
+ if( h->mb.b_trellis )
+ return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*4 );
+ else
+ return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
+}
+
+#define STORE_8x8_NNZ( p, idx, nz )\
+do\
+{\
+ M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+0] ) = (nz) * 0x0101;\
+ M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+8] ) = (nz) * 0x0101;\
+} while(0)
+
+#define CLEAR_16x16_NNZ( p ) \
+do\
+{\
+ M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 0]] ) = 0;\
+ M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 2]] ) = 0;\
+ M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 8]] ) = 0;\
+ M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+10]] ) = 0;\
+} while(0)
+
+static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode, int b_predict )
+{
+ int nz;
+ pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
+ pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
+ ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
+
+ if( b_predict )
+ {
+ if( h->mb.b_lossless )
+ x264_predict_lossless_4x4( h, p_dst, p, idx, i_mode );
+ else
+ h->predict_4x4[i_mode]( p_dst );
+ }
+
+ if( h->mb.b_lossless )
+ {
+ nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+idx], p_src, p_dst );
+ h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
+ h->mb.i_cbp_luma |= nz<<(idx>>2);
+ return;
+ }
+
+ h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
+
+ nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 1, p, idx );
+ h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
+ if( nz )
+ {
+ h->mb.i_cbp_luma |= 1<<(idx>>2);
+ h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4 );
+ h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[p?CQM_4IC:CQM_4IY], i_qp );
+ h->dctf.add4x4_idct( p_dst, dct4x4 );
+ }
+}
+
+static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge, int b_predict )
+{
+ int x = idx&1;
+ int y = idx>>1;
+ int nz;
+ pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
+ pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
+ ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
+ ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
+
+ if( b_predict )
+ {
+ if( !edge )
+ {
+ h->predict_8x8_filter( p_dst, edge_buf, h->mb.i_neighbour8[idx], x264_pred_i4x4_neighbors[i_mode] );
+ edge = edge_buf;
+ }
+
+ if( h->mb.b_lossless )
+ x264_predict_lossless_8x8( h, p_dst, p, idx, i_mode, edge );
+ else
+ h->predict_8x8[i_mode]( p_dst, edge );
+ }
+
+ if( h->mb.b_lossless )
+ {
+ nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+idx], p_src, p_dst );
+ STORE_8x8_NNZ( p, idx, nz );
+ h->mb.i_cbp_luma |= nz<<idx;
+ return;
+ }
+
+ h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
+
+ nz = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 1, p, idx );
+ if( nz )
+ {
+ h->mb.i_cbp_luma |= 1<<idx;
+ h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8 );
+ h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[p?CQM_8IC:CQM_8IY], i_qp );
+ h->dctf.add8x8_idct8( p_dst, dct8x8 );
+ STORE_8x8_NNZ( p, idx, 1 );
+ }
+ else
+ STORE_8x8_NNZ( p, idx, 0 );
+}
+
#endif
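x264_mb_encode_i4x4 and x264_mb_encode_i8x8 move into this header as ALWAYS_INLINE and gain a b_predict flag: the analysis paths earlier in the patch already leave the chosen prediction in p_fdec (either via the x9 helpers or by re-running predict_* explicitly), so they pass 0 and skip the redundant prediction, while the final-encode and RD-refinement paths keep passing 1. The resulting call pattern, as used elsewhere in this patch:

    /* analysis (encoder/analyse.c): prediction is already in fdec */
    x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx], 0 );
    x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 );

    /* final encode / RD refinement: let the function predict first */
    x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 );
    x264_mb_encode_i8x8( h, p, i8, i_qp, i_mode, edge[p], 1 );
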
diff --git a/encoder/me.c b/encoder/me.c
index 75ae29d..00bb412 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -830,7 +830,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
const int i_pixel = m->i_pixel;
const int b_chroma_me = h->mb.b_chroma_me && (i_pixel <= PIXEL_8x8 || CHROMA444);
int chromapix = h->luma2chroma_pixel[i_pixel];
- int chroma_v_shift = h->mb.chroma_v_shift;
+ int chroma_v_shift = CHROMA_V_SHIFT;
int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
ALIGNED_ARRAY_16( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
@@ -978,8 +978,8 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
ALIGNED_ARRAY_16( pixel, pixv_buf,[2],[9][16*16] );
pixel *src[3][2][9];
int chromapix = h->luma2chroma_pixel[i_pixel];
- int chroma_v_shift = h->mb.chroma_v_shift;
- int chroma_x = (8 >> h->mb.chroma_h_shift) * x;
+ int chroma_v_shift = CHROMA_V_SHIFT;
+ int chroma_x = (8 >> CHROMA_H_SHIFT) * x;
int chroma_y = (8 >> chroma_v_shift) * y;
pixel *pix = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
pixel *pixu = &h->mb.pic.p_fdec[1][chroma_x + chroma_y*FDEC_STRIDE];
@@ -1176,7 +1176,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
const int bw = x264_pixel_size[m->i_pixel].w;
const int bh = x264_pixel_size[m->i_pixel].h;
const int i_pixel = m->i_pixel;
- int chroma_v_shift = h->mb.chroma_v_shift;
+ int chroma_v_shift = CHROMA_V_SHIFT;
int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
uint64_t bcost = COST_MAX64;
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 9343874..aad66cd 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -219,7 +219,7 @@ static ALWAYS_INLINE uint32_t ac_energy_var( uint64_t sum_ssd, int shift, x264_f
static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i, int b_chroma, int b_field, int b_store )
{
- int height = b_chroma ? 16>>h->mb.chroma_v_shift : 16;
+ int height = b_chroma ? 16>>CHROMA_V_SHIFT : 16;
int stride = frame->i_stride[i];
int offset = b_field
? 16 * mb_x + height * (mb_y&~1) * stride + (mb_y&1) * stride
@@ -229,7 +229,7 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
{
ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*16] );
int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
- int shift = 7 - h->mb.chroma_v_shift;
+ int shift = 7 - CHROMA_V_SHIFT;
h->mc.load_deinterleave_chroma_fenc( pix, frame->plane[1] + offset, stride, height );
return ac_energy_var( h->pixf.var[chromapix]( pix, FENC_STRIDE ), shift, frame, 1, b_store )
@@ -247,6 +247,7 @@ static NOINLINE uint32_t x264_ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_
* function and make sure that its always called before the float math. Noinline makes
* sure no reordering goes on. */
uint32_t var;
+ x264_prefetch_fenc( h, frame, mb_x, mb_y );
if( h->mb.b_adaptive_mbaff )
{
/* We don't know the super-MB mode we're going to pick yet, so
@@ -382,8 +383,8 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_off
{
uint64_t ssd = frame->i_pixel_ssd[i];
uint64_t sum = frame->i_pixel_sum[i];
- int width = 16*h->mb.i_mb_width >> (i && h->mb.chroma_h_shift);
- int height = 16*h->mb.i_mb_height >> (i && h->mb.chroma_v_shift);
+ int width = 16*h->mb.i_mb_width >> (i && CHROMA_H_SHIFT);
+ int height = 16*h->mb.i_mb_height >> (i && CHROMA_V_SHIFT);
frame->i_pixel_ssd[i] = ssd - (sum * sum + width * height / 2) / (width * height);
}
}
@@ -2384,7 +2385,7 @@ static float rate_estimate_qscale( x264_t *h )
}
}
-void x264_threads_normalize_predictors( x264_t *h )
+static void x264_threads_normalize_predictors( x264_t *h )
{
double totalsize = 0;
for( int i = 0; i < h->param.i_threads; i++ )
diff --git a/encoder/rdo.c b/encoder/rdo.c
index 80b6cf0..f02c50b 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -249,8 +249,8 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
int ssd_y = 8*(i8>>1);
i_ssd = ssd_plane( h, i_pixel, 0, ssd_x, ssd_y );
int chromapix = h->luma2chroma_pixel[i_pixel];
- int chromassd = ssd_plane( h, chromapix, 1, ssd_x>>h->mb.chroma_h_shift, ssd_y>>h->mb.chroma_v_shift )
- + ssd_plane( h, chromapix, 2, ssd_x>>h->mb.chroma_h_shift, ssd_y>>h->mb.chroma_v_shift );
+ int chromassd = ssd_plane( h, chromapix, 1, ssd_x>>CHROMA_H_SHIFT, ssd_y>>CHROMA_V_SHIFT )
+ + ssd_plane( h, chromapix, 2, ssd_x>>CHROMA_H_SHIFT, ssd_y>>CHROMA_V_SHIFT );
i_ssd += ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
if( h->param.b_cabac )
@@ -276,7 +276,7 @@ static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode,
for( int p = 0; p < plane_count; p++ )
{
- x264_mb_encode_i8x8( h, p, i8, i_qp, i_mode, edge[p] );
+ x264_mb_encode_i8x8( h, p, i8, i_qp, i_mode, edge[p], 1 );
i_qp = h->mb.i_chroma_qp;
}
@@ -310,7 +310,7 @@ static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode
for( int p = 0; p < plane_count; p++ )
{
- x264_mb_encode_i4x4( h, p, i4, i_qp, i_mode );
+ x264_mb_encode_i4x4( h, p, i4, i_qp, i_mode, 1 );
i_qp = h->mb.i_chroma_qp;
}
diff --git a/encoder/set.c b/encoder/set.c
index 5e1ff64..876176a 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -278,7 +278,7 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps )
if( sps->i_profile_idc >= PROFILE_HIGH )
{
bs_write_ue( s, sps->i_chroma_format_idc );
- if( sps->i_chroma_format_idc == 3 )
+ if( sps->i_chroma_format_idc == CHROMA_444 )
bs_write1( s, 0 ); // separate_colour_plane_flag
bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_luma_minus8
bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_chroma_minus8
@@ -515,7 +515,7 @@ void x264_pps_write( bs_t *s, x264_sps_t *sps, x264_pps_t *pps )
bs_write1( s, 0 ); // Cr = Cb
if( pps->b_transform_8x8_mode )
{
- if( sps->i_chroma_format_idc == 3 )
+ if( sps->i_chroma_format_idc == CHROMA_444 )
{
scaling_list_write( s, pps, CQM_8IY+4 );
scaling_list_write( s, pps, CQM_8IC+4 );
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 8a1ebb1..e6fa93a 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -109,7 +109,7 @@ static NOINLINE void x264_weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc
int i_offset = i_stride / 2;
int i_lines = fenc->i_lines[1];
int i_width = fenc->i_width[1];
- int v_shift = h->mb.chroma_v_shift;
+ int v_shift = CHROMA_V_SHIFT;
int cw = 8*h->mb.i_mb_width;
int ch = 16*h->mb.i_mb_height >> v_shift;
int height = 16 >> v_shift;
@@ -227,7 +227,7 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f
ALIGNED_ARRAY_16( pixel, buf, [8*16] );
int pixoff = 0;
int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
- int height = 16 >> h->mb.chroma_v_shift;
+ int height = 16 >> CHROMA_V_SHIFT;
ALIGNED_16( static pixel flat[8] ) = {0};
if( w )
{
@@ -283,7 +283,7 @@ static NOINLINE unsigned int x264_weight_cost_chroma444( x264_t *h, x264_frame_t
return cost;
}
-void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
+static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
{
int i_delta_index = fenc->i_frame - ref->i_frame - 1;
/* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
diff --git a/input/timecode.c b/input/timecode.c
index cfec6c9..143304e 100644
--- a/input/timecode.c
+++ b/input/timecode.c
@@ -368,8 +368,6 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
timecode_input.picture_alloc = h->input.picture_alloc;
timecode_input.picture_clean = h->input.picture_clean;
- *p_handle = h;
-
tcfile_in = fopen( psz_filename, "rb" );
FAIL_IF_ERROR( !tcfile_in, "can't open `%s'\n", psz_filename )
else if( !x264_is_regular_file( tcfile_in ) )
@@ -392,6 +390,7 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
info->timebase_den = h->timebase_den;
info->vfr = 1;
+ *p_handle = h;
return 0;
}
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 802732a..1958a7d 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -55,7 +55,7 @@ int quiet = 0;
#define BENCH_RUNS 100 // tradeoff between accuracy and speed
#define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff)
#define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions
-#define MAX_CPUS 10 // number of different combinations of cpu flags
+#define MAX_CPUS 30 // number of different combinations of cpu flags
typedef struct
{
@@ -164,13 +164,14 @@ static void print_bench(void)
if( k < j )
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+ b->cpu&X264_CPU_FMA4 ? "fma4" :
+ b->cpu&X264_CPU_XOP ? "xop" :
b->cpu&X264_CPU_AVX ? "avx" :
b->cpu&X264_CPU_SSE4 ? "sse4" :
- b->cpu&X264_CPU_SHUFFLE_IS_FAST ? "fastshuffle" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
/* print sse2slow only if there's also a sse2fast version of the same func */
- b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
+ b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
b->cpu&X264_CPU_MMX ? "mmx" :
b->cpu&X264_CPU_ALTIVEC ? "altivec" :
@@ -178,6 +179,7 @@ static void print_bench(void)
b->cpu&X264_CPU_ARMV6 ? "armv6" : "c",
b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
+ b->cpu&X264_CPU_SHUFFLE_IS_FAST && !(b->cpu&X264_CPU_SSE4) ? "_fastshuffle" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
@@ -469,26 +471,108 @@ static int check_pixel( int cpu_ref, int cpu_new )
ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
for( int i=0; i<17; i++ ) \
bitcosts[i] = 9*(i!=8); \
+ memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
+ memcpy( pbuf4, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
for( int i=0; i<32; i++ ) \
{ \
pixel *fenc = pbuf1+48+i*12; \
- pixel *fdec = pbuf3+48+i*12; \
+ pixel *fdec1 = pbuf3+48+i*12; \
+ pixel *fdec2 = pbuf4+48+i*12; \
int pred_mode = i%9; \
int res_c = INT_MAX; \
for( int j=0; j<9; j++ ) \
{ \
- predict_4x4[j]( fdec ); \
- int cost = pixel_c.cmp[PIXEL_4x4]( fenc, FENC_STRIDE, fdec, FDEC_STRIDE ) + 9*(j!=pred_mode); \
+ predict_4x4[j]( fdec1 ); \
+ int cost = pixel_c.cmp[PIXEL_4x4]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
if( cost < (uint16_t)res_c ) \
res_c = cost + (j<<16); \
} \
- int res_a = call_a( pixel_asm.name, fenc, fdec, bitcosts+8-pred_mode ); \
+ predict_4x4[res_c>>16]( fdec1 ); \
+ int res_a = call_a( pixel_asm.name, fenc, fdec2, bitcosts+8-pred_mode ); \
if( res_c != res_a ) \
{ \
ok = 0; \
fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
break; \
} \
+ if( memcmp(fdec1, fdec2, 4*FDEC_STRIDE*sizeof(pixel)) ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, #name" [FAILED]\n" ); \
+ for( int j=0; j<16; j++ ) \
+ fprintf( stderr, "%02x ", fdec1[(j&3)+(j>>2)*FDEC_STRIDE] ); \
+ fprintf( stderr, "\n" ); \
+ for( int j=0; j<16; j++ ) \
+ fprintf( stderr, "%02x ", fdec2[(j&3)+(j>>2)*FDEC_STRIDE] ); \
+ fprintf( stderr, "\n" ); \
+ break; \
+ } \
+ } \
+ }
+
+#define TEST_INTRA8_X9( name, cmp ) \
+ if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
+ { \
+ set_func_name( #name ); \
+ used_asm = 1; \
+ ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
+ ALIGNED_ARRAY_16( uint16_t, satds_c,[16] ); \
+ ALIGNED_ARRAY_16( uint16_t, satds_a,[16] ); \
+ memset( satds_c, 0, 16 * sizeof(*satds_c) ); \
+ memset( satds_a, 0, 16 * sizeof(*satds_a) ); \
+ for( int i=0; i<17; i++ ) \
+ bitcosts[i] = 9*(i!=8); \
+ for( int i=0; i<32; i++ ) \
+ { \
+ pixel *fenc = pbuf1+48+i*12; \
+ pixel *fdec1 = pbuf3+48+i*12; \
+ pixel *fdec2 = pbuf4+48+i*12; \
+ int pred_mode = i%9; \
+ int res_c = INT_MAX; \
+ predict_8x8_filter( fdec1, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); \
+ for( int j=0; j<9; j++ ) \
+ { \
+ predict_8x8[j]( fdec1, edge ); \
+ satds_c[j] = pixel_c.cmp[PIXEL_8x8]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
+ if( satds_c[j] < (uint16_t)res_c ) \
+ res_c = satds_c[j] + (j<<16); \
+ } \
+ predict_8x8[res_c>>16]( fdec1, edge ); \
+ int res_a = call_a( pixel_asm.name, fenc, fdec2, edge, bitcosts+8-pred_mode, satds_a ); \
+ if( res_c != res_a || memcmp(satds_c, satds_a, sizeof(satds_c)) ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
+ for( int j = 0; j < 9; j++ ) \
+ fprintf( stderr, "%5d ", satds_c[j]); \
+ fprintf( stderr, "\n" ); \
+ for( int j = 0; j < 9; j++ ) \
+ fprintf( stderr, "%5d ", satds_a[j]); \
+ fprintf( stderr, "\n" ); \
+ break; \
+ } \
+ for( int j=0; j<8; j++ ) \
+ if( memcmp(fdec1+j*FDEC_STRIDE, fdec2+j*FDEC_STRIDE, 8*sizeof(pixel)) ) \
+ ok = 0; \
+ if( !ok ) \
+ { \
+ fprintf( stderr, #name" [FAILED]\n" ); \
+ for( int j=0; j<8; j++ ) \
+ { \
+ for( int k=0; k<8; k++ ) \
+ fprintf( stderr, "%02x ", fdec1[k+j*FDEC_STRIDE] ); \
+ fprintf( stderr, "\n" ); \
+ } \
+ fprintf( stderr, "\n" ); \
+ for( int j=0; j<8; j++ ) \
+ { \
+ for( int k=0; k<8; k++ ) \
+ fprintf( stderr, "%02x ", fdec2[k+j*FDEC_STRIDE] ); \
+ fprintf( stderr, "\n" ); \
+ } \
+ fprintf( stderr, "\n" ); \
+ break; \
+ } \
} \
}
@@ -509,9 +593,11 @@ static int check_pixel( int cpu_ref, int cpu_new )
report( "intra sad_x3 :" );
ok = 1; used_asm = 0;
TEST_INTRA_X9( intra_satd_x9_4x4, satd );
+ TEST_INTRA8_X9( intra_sa8d_x9_8x8, sa8d );
report( "intra satd_x9 :" );
ok = 1; used_asm = 0;
TEST_INTRA_X9( intra_sad_x9_4x4, sad );
+ TEST_INTRA8_X9( intra_sad_x9_8x8, sad );
report( "intra sad_x9 :" );
ok = 1; used_asm = 0;
@@ -565,7 +651,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
{
ALIGNED_16( uint16_t sums[72] );
ALIGNED_16( int dc[4] );
- int16_t mvs_a[32], mvs_c[32];
+ ALIGNED_16( int16_t mvs_a[32] );
+ ALIGNED_16( int16_t mvs_c[32] );
int mvn_a, mvn_c;
int thresh = rand() & 0x3fff;
set_func_name( "esa_ads" );
@@ -1365,8 +1452,12 @@ static int check_mc( int cpu_ref, int cpu_new )
call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, &fps_factor, 100 );
// I don't care about exact rounding, this is just how close the floating-point implementation happens to be
x264_emms();
- for( int j = 0; j < 100; j++ )
+ for( int j = 0; j < 100 && ok; j++ )
+ {
ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
+ if( !ok )
+ fprintf( stderr, "mbtree_propagate FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
+ }
}
report( "mbtree propagate :" );
}
@@ -2205,6 +2296,9 @@ static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
{
*cpu_ref = *cpu_new;
*cpu_new |= flags;
+#if BROKEN_STACK_ALIGNMENT
+ *cpu_new |= X264_CPU_STACK_MOD4;
+#endif
if( *cpu_new & X264_CPU_SSE2_IS_FAST )
*cpu_new &= ~X264_CPU_SSE2_IS_SLOW;
if( !quiet )
@@ -2239,6 +2333,7 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
+ cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
@@ -2248,23 +2343,24 @@ static int check_all_flags( void )
}
if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
{
- cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
cpu1 &= ~X264_CPU_SSE_MISALIGN;
}
if( x264_cpu_detect() & X264_CPU_LZCNT )
{
- cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
}
if( x264_cpu_detect() & X264_CPU_SSE3 )
+ {
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
+ cpu1 &= ~X264_CPU_CACHELINE_64;
+ }
if( x264_cpu_detect() & X264_CPU_SSSE3 )
{
- cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
+ cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
@@ -2273,12 +2369,13 @@ static int check_all_flags( void )
cpu1 &= ~X264_CPU_SLOW_ATOM;
}
if( x264_cpu_detect() & X264_CPU_SSE4 )
- {
- cpu1 &= ~X264_CPU_CACHELINE_64;
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
- }
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4 | X264_CPU_SHUFFLE_IS_FAST, "SSE4" );
if( x264_cpu_detect() & X264_CPU_AVX )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
+ if( x264_cpu_detect() & X264_CPU_XOP )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" );
+ if( x264_cpu_detect() & X264_CPU_FMA4 )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{
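Flag sequencing in check_all_flags is also tidied up: a block that enables a transient flag such as CACHELINE_64 now clears it itself once benchmarked, instead of every later block carrying a defensive cpu1 &= ~... line, and the new XOP/FMA4 combinations (together with the MAX_CPUS bump from 10 to 30 above) extend the tested list. The pattern, in sketch form:

    ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
    ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
    cpu1 &= ~X264_CPU_CACHELINE_64;  /* cleared here, not by the later SSE4/AVX blocks */
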
diff --git a/version.sh b/version.sh
index 9790f89..9c40514 100755
--- a/version.sh
+++ b/version.sh
@@ -1,5 +1,5 @@
#!/bin/sh
# Script modified from upstream source for Debian packaging since packaging
# won't include .git repository.
-echo '#define X264_VERSION " r2092 6eac7c3"'
-echo '#define X264_POINTVER "0.118.2092 6eac7c3"'
+echo '#define X264_VERSION " r2113 cc129ad"'
+echo '#define X264_POINTVER "0.119.2113 cc129ad"'
diff --git a/x264.c b/x264.c
index 1e78b27..b40b873 100644
--- a/x264.c
+++ b/x264.c
@@ -53,6 +53,7 @@
#endif
#if HAVE_SWSCALE
+#undef DECLARE_ALIGNED
#include <libswscale/swscale.h>
#endif
@@ -121,7 +122,19 @@ static const char * const muxer_names[] =
static const char * const pulldown_names[] = { "none", "22", "32", "64", "double", "triple", "euro", 0 };
static const char * const log_level_names[] = { "none", "error", "warning", "info", "debug", 0 };
-static const char * const output_csp_names[] = { "i420", "i422", "i444", "rgb", 0 };
+static const char * const output_csp_names[] =
+{
+#if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I420
+ "i420",
+#endif
+#if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I422
+ "i422",
+#endif
+#if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I444
+ "i444", "rgb",
+#endif
+ 0
+};
typedef struct
{
@@ -234,7 +247,7 @@ static void print_version_info()
#else
printf( "using an unknown compiler\n" );
#endif
- printf( "configuration: --bit-depth=%d\n", x264_bit_depth );
+ printf( "configuration: --bit-depth=%d --chroma-format=%s\n", x264_bit_depth, X264_CHROMA_FORMAT ? (output_csp_names[0]+1) : "all" );
printf( "x264 license: " );
#if HAVE_GPL
printf( "GPL version 2 or later\n" );
@@ -426,6 +439,7 @@ static void help( x264_param_t *defaults, int longhelp )
H0( " --profile <string> Force the limits of an H.264 profile\n"
" Overrides all settings.\n" );
H2(
+#if X264_CHROMA_FORMAT <= X264_CSP_I420
#if BIT_DEPTH==8
" - baseline:\n"
" --no-8x8dct --bframes 0 --no-cabac\n"
@@ -441,19 +455,28 @@ static void help( x264_param_t *defaults, int longhelp )
" - high10:\n"
" No lossless.\n"
" Support for bit depth 8-10.\n"
+#endif
+#if X264_CHROMA_FORMAT <= X264_CSP_I422
" - high422:\n"
" No lossless.\n"
" Support for bit depth 8-10.\n"
" Support for 4:2:0/4:2:2 chroma subsampling.\n"
+#endif
" - high444:\n"
" Support for bit depth 8-10.\n"
" Support for 4:2:0/4:2:2/4:4:4 chroma subsampling.\n" );
else H0(
+ " - "
+#if X264_CHROMA_FORMAT <= X264_CSP_I420
#if BIT_DEPTH==8
- " - baseline,main,high,high10,high422,high444\n"
-#else
- " - high10,high422,high444\n"
+ "baseline,main,high,"
+#endif
+ "high10,"
#endif
+#if X264_CHROMA_FORMAT <= X264_CSP_I422
+ "high422,"
+#endif
+ "high444\n"
);
H0( " --preset <string> Use a preset to select encoding settings [medium]\n"
" Overridden by user settings.\n" );
@@ -1373,7 +1396,11 @@ static int parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
case OPT_OUTPUT_CSP:
FAIL_IF_ERROR( parse_enum_value( optarg, output_csp_names, &output_csp ), "Unknown output csp `%s'\n", optarg )
// correct the parsed value to the libx264 csp value
+#if X264_CHROMA_FORMAT
+ static const uint8_t output_csp_fix[] = { X264_CHROMA_FORMAT, X264_CSP_RGB };
+#else
static const uint8_t output_csp_fix[] = { X264_CSP_I420, X264_CSP_I422, X264_CSP_I444, X264_CSP_RGB };
+#endif
param->i_csp = output_csp = output_csp_fix[output_csp];
break;
default:
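
Because a chroma-restricted build exposes a shorter output_csp_names[] table, the index obtained from parse_enum_value() must be remapped through a table of matching length; otherwise index 0 would always be taken as i420 even in, say, an i444-only build. A hedged sketch of that remapping, with illustrative stand-in values for the X264_CSP_* macros:

    /* Sketch: translate the parsed table index into the real colourspace
     * constant.  CSP_* values below are illustrative stand-ins. */
    enum { CSP_I420 = 1, CSP_I422 = 2, CSP_I444 = 3, CSP_RGB = 4 };

    #define CHROMA_FORMAT 0    /* 0 = unrestricted build, else one CSP_* value */

    static int fix_output_csp( int parsed_index )
    {
    #if CHROMA_FORMAT
        static const int fix[] = { CHROMA_FORMAT, CSP_RGB };
    #else
        static const int fix[] = { CSP_I420, CSP_I422, CSP_I444, CSP_RGB };
    #endif
        return fix[parsed_index];
    }
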
diff --git a/x264.h b/x264.h
index afae0e2..5e88e48 100644
--- a/x264.h
+++ b/x264.h
@@ -41,7 +41,7 @@
#include "x264_config.h"
-#define X264_BUILD 118
+#define X264_BUILD 119
/* x264_t:
* opaque handler for encoder */
@@ -100,31 +100,33 @@ typedef struct
****************************************************************************/
/* CPU flags
*/
-#define X264_CPU_CACHELINE_32 0x000001 /* avoid memory loads that span the border between two cachelines */
-#define X264_CPU_CACHELINE_64 0x000002 /* 32/64 is the size of a cacheline in bytes */
-#define X264_CPU_ALTIVEC 0x000004
-#define X264_CPU_MMX 0x000008
-#define X264_CPU_MMX2 0x000010 /* MMX2 aka MMXEXT aka ISSE */
-#define X264_CPU_MMXEXT X264_CPU_MMX2
-#define X264_CPU_SSE 0x000020
-#define X264_CPU_SSE2 0x000040
-#define X264_CPU_SSE2_IS_SLOW 0x000080 /* avoid most SSE2 functions on Athlon64 */
-#define X264_CPU_SSE2_IS_FAST 0x000100 /* a few functions are only faster on Core2 and Phenom */
-#define X264_CPU_SSE3 0x000200
-#define X264_CPU_SSSE3 0x000400
-#define X264_CPU_SHUFFLE_IS_FAST 0x000800 /* Penryn, Nehalem, and Phenom have fast shuffle units */
-#define X264_CPU_STACK_MOD4 0x001000 /* if stack is only mod4 and not mod16 */
-#define X264_CPU_SSE4 0x002000 /* SSE4.1 */
-#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
-#define X264_CPU_SSE_MISALIGN 0x008000 /* Phenom support for misaligned SSE instruction arguments */
-#define X264_CPU_LZCNT 0x010000 /* Phenom support for "leading zero count" instruction. */
-#define X264_CPU_ARMV6 0x020000
-#define X264_CPU_NEON 0x040000 /* ARM NEON */
-#define X264_CPU_FAST_NEON_MRC 0x080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
-#define X264_CPU_SLOW_CTZ 0x100000 /* BSR/BSF x86 instructions are really slow on some CPUs */
-#define X264_CPU_SLOW_ATOM 0x200000 /* The Atom just sucks */
-#define X264_CPU_AVX 0x400000 /* AVX support: requires OS support even if YMM registers
- * aren't used. */
+#define X264_CPU_CACHELINE_32 0x0000001 /* avoid memory loads that span the border between two cachelines */
+#define X264_CPU_CACHELINE_64 0x0000002 /* 32/64 is the size of a cacheline in bytes */
+#define X264_CPU_ALTIVEC 0x0000004
+#define X264_CPU_MMX 0x0000008
+#define X264_CPU_MMX2 0x0000010 /* MMX2 aka MMXEXT aka ISSE */
+#define X264_CPU_MMXEXT X264_CPU_MMX2
+#define X264_CPU_SSE 0x0000020
+#define X264_CPU_SSE2 0x0000040
+#define X264_CPU_SSE2_IS_SLOW 0x0000080 /* avoid most SSE2 functions on Athlon64 */
+#define X264_CPU_SSE2_IS_FAST 0x0000100 /* a few functions are only faster on Core2 and Phenom */
+#define X264_CPU_SSE3 0x0000200
+#define X264_CPU_SSSE3 0x0000400
+#define X264_CPU_SHUFFLE_IS_FAST 0x0000800 /* Penryn, Nehalem, and Phenom have fast shuffle units */
+#define X264_CPU_STACK_MOD4 0x0001000 /* if stack is only mod4 and not mod16 */
+#define X264_CPU_SSE4 0x0002000 /* SSE4.1 */
+#define X264_CPU_SSE42 0x0004000 /* SSE4.2 */
+#define X264_CPU_SSE_MISALIGN 0x0008000 /* Phenom support for misaligned SSE instruction arguments */
+#define X264_CPU_LZCNT 0x0010000 /* Phenom support for "leading zero count" instruction. */
+#define X264_CPU_ARMV6 0x0020000
+#define X264_CPU_NEON 0x0040000 /* ARM NEON */
+#define X264_CPU_FAST_NEON_MRC 0x0080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
+#define X264_CPU_SLOW_CTZ 0x0100000 /* BSR/BSF x86 instructions are really slow on some CPUs */
+#define X264_CPU_SLOW_ATOM 0x0200000 /* The Atom just sucks */
+#define X264_CPU_AVX 0x0400000 /* AVX support: requires OS support even if YMM registers
+ * aren't used. */
+#define X264_CPU_XOP 0x0800000 /* AMD XOP */
+#define X264_CPU_FMA4 0x1000000 /* AMD FMA4 */
/* Analyse flags
*/
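
Every existing flag keeps its value; the constants are merely rewritten with one extra hex digit so the two new AMD bits, X264_CPU_XOP (0x0800000) and X264_CPU_FMA4 (0x1000000), fit the column layout. Callers test them with the usual mask-and-branch idiom, e.g. (sketch with a made-up detection result):

    #include <stdint.h>
    #include <stdio.h>

    #define CPU_AVX  0x0400000   /* same values as the x264.h block above */
    #define CPU_XOP  0x0800000
    #define CPU_FMA4 0x1000000

    int main( void )
    {
        uint32_t cpu = CPU_AVX | CPU_XOP;    /* made-up detection result */
        if( cpu & CPU_XOP )
            puts( "XOP code paths enabled" );
        if( !(cpu & CPU_FMA4) )
            puts( "FMA4 code paths skipped" );
        return 0;
    }
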
@@ -607,6 +609,13 @@ int x264_param_apply_profile( x264_param_t *, const char *profile );
* colorspace depth as well. */
extern const int x264_bit_depth;
+/* x264_chroma_format:
+ * Specifies the chroma formats that x264 supports encoding. When this
+ * value is non-zero, then it represents a X264_CSP_* that is the only
+ * chroma format that x264 supports encoding. If the value is 0 then
+ * there are no restrictions. */
+extern const int x264_chroma_format;
+
enum pic_struct_e
{
PIC_STRUCT_AUTO = 0, // automatically decide (default)
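
The new x264_chroma_format constant lets an application discover at run time whether the libx264 it linked against was built for a single chroma format, much as x264_bit_depth already does for bit depth. A hedged usage sketch against the API declared above (requires an x264.h with X264_BUILD >= 119):

    #include <stdio.h>
    #include <x264.h>

    /* Sketch: reject an input colourspace that a chroma-restricted
     * libx264 build cannot encode. */
    static int csp_supported( int csp )
    {
        return !x264_chroma_format || x264_chroma_format == csp;
    }

    int main( void )
    {
        if( !csp_supported( X264_CSP_I422 ) )
            fprintf( stderr, "this libx264 build cannot encode 4:2:2 input\n" );
        return 0;
    }
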
--
x264 packaging