[SCM] x264/upstream: New upstream version 0.152.2854+gite9a5903
ricotz-guest at users.alioth.debian.org
Sun Dec 31 13:12:46 UTC 2017
The following commit has been merged in the upstream branch:
commit e80e4ac59271f3037ad8b7252e59dc50340d6d8e
Author: Rico Tzschichholz <ricotz at ubuntu.com>
Date: Sun Dec 31 13:54:44 2017 +0100
New upstream version 0.152.2854+gite9a5903
diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index 48209b2..047d3db 100644
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -569,57 +569,65 @@ endfunc
.macro pixel_var2_8 h
function x264_pixel_var2_8x\h\()_neon, export=1
- ld1 {v16.8b}, [x0], x1
- ld1 {v18.8b}, [x2], x3
- ld1 {v17.8b}, [x0], x1
- ld1 {v19.8b}, [x2], x3
- mov x5, \h - 4
- usubl v6.8h, v16.8b, v18.8b
- usubl v7.8h, v17.8b, v19.8b
- ld1 {v16.8b}, [x0], x1
- ld1 {v18.8b}, [x2], x3
- smull v2.4s, v6.4h, v6.4h
- smull2 v3.4s, v6.8h, v6.8h
- add v0.8h, v6.8h, v7.8h
- smlal v2.4s, v7.4h, v7.4h
- smlal2 v3.4s, v7.8h, v7.8h
+ mov x3, #16
+ ld1 {v16.8b}, [x0], #8
+ ld1 {v18.8b}, [x1], x3
+ ld1 {v17.8b}, [x0], #8
+ ld1 {v19.8b}, [x1], x3
+ mov x5, \h - 2
+ usubl v0.8h, v16.8b, v18.8b
+ usubl v1.8h, v17.8b, v19.8b
+ ld1 {v16.8b}, [x0], #8
+ ld1 {v18.8b}, [x1], x3
+ smull v2.4s, v0.4h, v0.4h
+ smull2 v3.4s, v0.8h, v0.8h
+ smull v4.4s, v1.4h, v1.4h
+ smull2 v5.4s, v1.8h, v1.8h
usubl v6.8h, v16.8b, v18.8b
-1: subs x5, x5, #2
- ld1 {v17.8b}, [x0], x1
- ld1 {v19.8b}, [x2], x3
+1: subs x5, x5, #1
+ ld1 {v17.8b}, [x0], #8
+ ld1 {v19.8b}, [x1], x3
smlal v2.4s, v6.4h, v6.4h
smlal2 v3.4s, v6.8h, v6.8h
usubl v7.8h, v17.8b, v19.8b
add v0.8h, v0.8h, v6.8h
- ld1 {v16.8b}, [x0], x1
- ld1 {v18.8b}, [x2], x3
- smlal v2.4s, v7.4h, v7.4h
- smlal2 v3.4s, v7.8h, v7.8h
+ ld1 {v16.8b}, [x0], #8
+ ld1 {v18.8b}, [x1], x3
+ smlal v4.4s, v7.4h, v7.4h
+ smlal2 v5.4s, v7.8h, v7.8h
usubl v6.8h, v16.8b, v18.8b
- add v0.8h, v0.8h, v7.8h
+ add v1.8h, v1.8h, v7.8h
b.gt 1b
- ld1 {v17.8b}, [x0], x1
- ld1 {v19.8b}, [x2], x3
+ ld1 {v17.8b}, [x0], #8
+ ld1 {v19.8b}, [x1], x3
smlal v2.4s, v6.4h, v6.4h
smlal2 v3.4s, v6.8h, v6.8h
usubl v7.8h, v17.8b, v19.8b
add v0.8h, v0.8h, v6.8h
- smlal v2.4s, v7.4h, v7.4h
- add v0.8h, v0.8h, v7.8h
- smlal2 v3.4s, v7.8h, v7.8h
+ smlal v4.4s, v7.4h, v7.4h
+ add v1.8h, v1.8h, v7.8h
+ smlal2 v5.4s, v7.8h, v7.8h
saddlv s0, v0.8h
+ saddlv s1, v1.8h
add v2.4s, v2.4s, v3.4s
+ add v4.4s, v4.4s, v5.4s
mov w0, v0.s[0]
- addv s1, v2.4s
- sxtw x0, w0
mov w1, v1.s[0]
- mul x0, x0, x0
- str w1, [x4]
- sub x0, x1, x0, lsr # 6 + (\h >> 4)
+ addv s2, v2.4s
+ addv s4, v4.4s
+ mul w0, w0, w0
+ mul w1, w1, w1
+ mov w3, v2.s[0]
+ mov w4, v4.s[0]
+ sub w0, w3, w0, lsr # 6 + (\h >> 4)
+ sub w1, w4, w1, lsr # 6 + (\h >> 4)
+ str w3, [x2]
+ add w0, w0, w1
+ str w4, [x2, #4]
ret
endfunc
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index 8a7b83e..5206a0c 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -61,8 +61,8 @@ uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
-int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
+int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index f562009..155e1cf 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -719,13 +719,24 @@ function x264_var_end, export=0
bx lr
endfunc
-.macro DIFF_SUM diff da db lastdiff
- vld1.64 {\da}, [r0,:64], r1
- vld1.64 {\db}, [r2,:64], r3
-.ifnb \lastdiff
- vadd.s16 q0, q0, \lastdiff
+.macro DIFF_SUM diff1 diff2 da1 db1 da2 db2 lastdiff1 lastdiff2 acc1 acc2
+ vld1.64 {\da1}, [r0,:64]!
+ vld1.64 {\db1}, [r1,:64], r3
+.ifnb \lastdiff1
+ vadd.s16 \acc1, \acc1, \lastdiff1
+ vadd.s16 \acc2, \acc2, \lastdiff2
.endif
- vsubl.u8 \diff, \da, \db
+ vld1.64 {\da2}, [r0,:64]!
+ vld1.64 {\db2}, [r1,:64], r3
+ vsubl.u8 \diff1, \da1, \db1
+ vsubl.u8 \diff2, \da2, \db2
+.endm
+
+.macro SQR_ACC_DOUBLE acc1 acc2 d0 d1 d2 d3 vmlal=vmlal.s16
+ \vmlal \acc1, \d0, \d0
+ vmlal.s16 \acc1, \d1, \d1
+ \vmlal \acc2, \d2, \d2
+ vmlal.s16 \acc2, \d3, \d3
.endm
.macro SQR_ACC acc d0 d1 vmlal=vmlal.s16
@@ -734,77 +745,89 @@ endfunc
.endm
function x264_pixel_var2_8x8_neon
- DIFF_SUM q0, d0, d1
- DIFF_SUM q8, d16, d17
- SQR_ACC q1, d0, d1, vmull.s16
- DIFF_SUM q9, d18, d19, q8
- SQR_ACC q2, d16, d17, vmull.s16
+ mov r3, #16
+ DIFF_SUM q0, q10, d0, d1, d20, d21
+ DIFF_SUM q8, q11, d16, d17, d22, d23
+ SQR_ACC_DOUBLE q1, q13, d0, d1, d20, d21, vmull.s16
+ DIFF_SUM q9, q12, d18, d19, d24, d25, q8, q11, q0, q10
+ SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23, vmull.s16
.rept 2
- DIFF_SUM q8, d16, d17, q9
- SQR_ACC q1, d18, d19
- DIFF_SUM q9, d18, d19, q8
- SQR_ACC q2, d16, d17
+ DIFF_SUM q8, q11, d16, d17, d22, d23, q9, q12, q0, q10
+ SQR_ACC_DOUBLE q1, q13, d18, d19, d24, d25
+ DIFF_SUM q9, q12, d18, d19, d24, d25, q8, q11, q0, q10
+ SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23
.endr
- DIFF_SUM q8, d16, d17, q9
- SQR_ACC q1, d18, d19
+ DIFF_SUM q8, q11, d16, d17, d22, d23, q9, q12, q0, q10
+ SQR_ACC_DOUBLE q1, q13, d18, d19, d24, d25
vadd.s16 q0, q0, q8
- SQR_ACC q2, d16, d17
+ vadd.s16 q10, q10, q11
+ SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23
- ldr ip, [sp]
vadd.s16 d0, d0, d1
+ vadd.s16 d20, d20, d21
vadd.s32 q1, q1, q2
+ vadd.s32 q13, q13, q14
vpaddl.s16 d0, d0
+ vpaddl.s16 d20, d20
vadd.s32 d1, d2, d3
- vpadd.s32 d0, d0, d1
+ vadd.s32 d26, d26, d27
+ vpadd.s32 d0, d0, d20 @ sum
+ vpadd.s32 d1, d1, d26 @ sqr
+ vmul.s32 d0, d0, d0 @ sum*sum
+ vshr.s32 d0, d0, #6
+ vsub.s32 d0, d1, d0
+ vpadd.s32 d0, d0, d0
vmov r0, r1, d0
- vst1.32 {d0[1]}, [ip,:32]
- mul r0, r0, r0
- sub r0, r1, r0, lsr #6
+ vst1.32 {d1}, [r2,:64]
bx lr
endfunc
function x264_pixel_var2_8x16_neon
- vld1.64 {d16}, [r0,:64], r1
- vld1.64 {d17}, [r2,:64], r3
- vld1.64 {d18}, [r0,:64], r1
- vld1.64 {d19}, [r2,:64], r3
+ mov r3, #16
+ vld1.64 {d16}, [r0,:64]!
+ vld1.64 {d17}, [r1,:64], r3
+ vld1.64 {d18}, [r0,:64]!
+ vld1.64 {d19}, [r1,:64], r3
+ vsubl.u8 q0, d16, d17
+ vsubl.u8 q3, d18, d19
+ SQR_ACC q1, d0, d1, vmull.s16
+ vld1.64 {d16}, [r0,:64]!
+ mov ip, #15
+ vld1.64 {d17}, [r1,:64], r3
+ SQR_ACC q2, d6, d7, vmull.s16
+1: subs ip, ip, #1
+ vld1.64 {d18}, [r0,:64]!
vsubl.u8 q10, d16, d17
- vsubl.u8 q11, d18, d19
- SQR_ACC q1, d20, d21, vmull.s16
- vld1.64 {d16}, [r0,:64], r1
- vadd.s16 q0, q10, q11
- vld1.64 {d17}, [r2,:64], r3
- SQR_ACC q2, d22, d23, vmull.s16
- mov ip, #14
-1: subs ip, ip, #2
- vld1.64 {d18}, [r0,:64], r1
- vsubl.u8 q10, d16, d17
- vld1.64 {d19}, [r2,:64], r3
+ vld1.64 {d19}, [r1,:64], r3
vadd.s16 q0, q0, q10
SQR_ACC q1, d20, d21
vsubl.u8 q11, d18, d19
beq 2f
- vld1.64 {d16}, [r0,:64], r1
- vadd.s16 q0, q0, q11
- vld1.64 {d17}, [r2,:64], r3
+ vld1.64 {d16}, [r0,:64]!
+ vadd.s16 q3, q3, q11
+ vld1.64 {d17}, [r1,:64], r3
SQR_ACC q2, d22, d23
b 1b
2:
- vadd.s16 q0, q0, q11
+ vadd.s16 q3, q3, q11
SQR_ACC q2, d22, d23
- ldr ip, [sp]
vadd.s16 d0, d0, d1
- vadd.s32 q1, q1, q2
+ vadd.s16 d6, d6, d7
vpaddl.s16 d0, d0
- vadd.s32 d1, d2, d3
- vpadd.s32 d0, d0, d1
+ vpaddl.s16 d6, d6
+ vadd.s32 d2, d2, d3
+ vadd.s32 d4, d4, d5
+ vpadd.s32 d0, d0, d6 @ sum
+ vpadd.s32 d2, d2, d4 @ sqr
+ vmul.s32 d0, d0, d0 @ sum*sum
+ vshr.s32 d0, d0, #7
+ vsub.s32 d0, d2, d0
+ vpadd.s32 d0, d0, d0
vmov r0, r1, d0
- vst1.32 {d0[1]}, [ip,:32]
- mul r0, r0, r0
- sub r0, r1, r0, lsr #7
+ vst1.32 {d2}, [r2,:64]
bx lr
endfunc
diff --git a/common/arm/pixel.h b/common/arm/pixel.h
index 8a6751b..d9b02c4 100644
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -63,8 +63,8 @@ uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
-int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
+int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
diff --git a/common/bitstream.c b/common/bitstream.c
index d6c1c2c..cc76300 100644
--- a/common/bitstream.c
+++ b/common/bitstream.c
@@ -43,16 +43,19 @@ uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end );
void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-void x264_cabac_block_residual_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-void x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_8x8_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-void x264_cabac_block_residual_internal_sse2_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
-void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_8x8_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_internal_avx2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_internal_avx512( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
@@ -116,7 +119,7 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
pf->nal_escape = x264_nal_escape_c;
#if HAVE_MMX
-#if ARCH_X86_64
+#if ARCH_X86_64 && !defined( __MACH__ )
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2;
@@ -126,18 +129,17 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
pf->nal_escape = x264_nal_escape_mmx2;
if( cpu&X264_CPU_SSE2 )
{
-#if ARCH_X86_64
- if( cpu&X264_CPU_LZCNT )
- {
- pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2_lzcnt;
- pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2_lzcnt;
- pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt;
- }
-#endif
if( cpu&X264_CPU_SSE2_IS_FAST )
pf->nal_escape = x264_nal_escape_sse2;
}
-#if ARCH_X86_64
+#if ARCH_X86_64 && !defined( __MACH__ )
+ if( cpu&X264_CPU_LZCNT )
+ {
+ pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_lzcnt;
+ pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_lzcnt;
+ pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_lzcnt;
+ }
+
if( cpu&X264_CPU_SSSE3 )
{
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3;
@@ -152,8 +154,14 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
if( cpu&X264_CPU_AVX2 )
{
pf->nal_escape = x264_nal_escape_avx2;
- if( cpu&X264_CPU_BMI2 )
- pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2_bmi2;
+ pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2;
+ }
+
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx512;
+ pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_avx512;
+ pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_avx512;
}
#endif
#endif
diff --git a/common/cabac.h b/common/cabac.h
index 5af856a..1378834 100644
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -42,7 +42,7 @@ typedef struct
uint8_t *p_end;
/* aligned for memcpy_aligned starting here */
- ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
+ ALIGNED_64( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
/* context */
uint8_t state[1024];
diff --git a/common/common.c b/common/common.c
index 14d4670..561212d 100644
--- a/common/common.c
+++ b/common/common.c
@@ -669,7 +669,7 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
{
if( !strcmp(value, "1b") )
p->i_level_idc = 9;
- else if( atof(value) < 6 )
+ else if( atof(value) < 7 )
p->i_level_idc = (int)(10*atof(value)+.5);
else
p->i_level_idc = atoi(value);
@@ -1143,6 +1143,8 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
[X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } },
[X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } },
[X264_CSP_NV16] = { 2, { 256*1, 256*1 }, { 256*1, 256*1 }, },
+ [X264_CSP_YUYV] = { 1, { 256*2 }, { 256*1 }, },
+ [X264_CSP_UYVY] = { 1, { 256*2 }, { 256*1 }, },
[X264_CSP_I444] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } },
[X264_CSP_YV24] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } },
[X264_CSP_BGR] = { 1, { 256*3 }, { 256*1 }, },
diff --git a/common/common.h b/common/common.h
index 8cc1dc1..867b207 100644
--- a/common/common.h
+++ b/common/common.h
@@ -635,11 +635,11 @@ struct x264_t
/* Current MB DCT coeffs */
struct
{
- ALIGNED_32( dctcoef luma16x16_dc[3][16] );
+ ALIGNED_64( dctcoef luma16x16_dc[3][16] );
ALIGNED_16( dctcoef chroma_dc[2][8] );
// FIXME share memory?
- ALIGNED_32( dctcoef luma8x8[12][64] );
- ALIGNED_32( dctcoef luma4x4[16*3][16] );
+ ALIGNED_64( dctcoef luma8x8[12][64] );
+ ALIGNED_64( dctcoef luma4x4[16*3][16] );
} dct;
/* MB table and cache for current frame/mb */
@@ -729,7 +729,7 @@ struct x264_t
int8_t *type; /* mb type */
uint8_t *partition; /* mb partition */
int8_t *qp; /* mb qp */
- int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc (all set for PCM)*/
+ int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x200 and 0x400: chroma dc, 0x1000 PCM (all set for PCM) */
int8_t (*intra4x4_pred_mode)[8]; /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
/* actually has only 7 entries; set to 8 for write-combining optimizations */
uint8_t (*non_zero_count)[16*3]; /* nzc. for I_PCM set to 16 */
@@ -740,8 +740,7 @@ struct x264_t
int16_t (*mvr[2][X264_REF_MAX*2])[2];/* 16x16 mv for each possible ref */
int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
int8_t *mb_transform_size; /* transform_size_8x8_flag of each mb */
- uint16_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of
- * NOTE: this will fail on resolutions above 2^16 MBs... */
+ uint32_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of */
uint8_t *field;
/* buffer for weighted versions of the reference frames */
@@ -778,26 +777,27 @@ struct x264_t
/* space for p_fenc and p_fdec */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
- ALIGNED_32( pixel fenc_buf[48*FENC_STRIDE] );
- ALIGNED_32( pixel fdec_buf[52*FDEC_STRIDE] );
+ ALIGNED_64( pixel fenc_buf[48*FENC_STRIDE] );
+ ALIGNED_64( pixel fdec_buf[54*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
ALIGNED_16( pixel i8x8_fdec_buf[16*16] );
- ALIGNED_16( dctcoef i8x8_dct_buf[3][64] );
- ALIGNED_16( dctcoef i4x4_dct_buf[15][16] );
+ ALIGNED_64( dctcoef i8x8_dct_buf[3][64] );
+ ALIGNED_64( dctcoef i4x4_dct_buf[15][16] );
uint32_t i4x4_nnz_buf[4];
uint32_t i8x8_nnz_buf[4];
- int i4x4_cbp;
- int i8x8_cbp;
/* Psy trellis DCT data */
ALIGNED_16( dctcoef fenc_dct8[4][64] );
ALIGNED_16( dctcoef fenc_dct4[16][16] );
/* Psy RD SATD/SA8D scores cache */
- ALIGNED_32( uint64_t fenc_hadamard_cache[9] );
- ALIGNED_32( uint32_t fenc_satd_cache[32] );
+ ALIGNED_64( uint32_t fenc_satd_cache[32] );
+ ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
+
+ int i4x4_cbp;
+ int i8x8_cbp;
/* pointer over mb of the frame to be compressed */
pixel *p_fenc[3]; /* y,u,v */
@@ -822,10 +822,10 @@ struct x264_t
struct
{
/* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */
- ALIGNED_8( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] );
+ ALIGNED_16( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] );
- /* i_non_zero_count if available else 0x80 */
- ALIGNED_16( uint8_t non_zero_count[X264_SCAN8_SIZE] );
+ /* i_non_zero_count if available else 0x80. intentionally misaligned by 8 for asm */
+ ALIGNED_8( uint8_t non_zero_count[X264_SCAN8_SIZE] );
/* -1 if unused, -2 if unavailable */
ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
diff --git a/common/cpu.c b/common/cpu.c
index 636a40c..f365482 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -47,8 +47,7 @@ const x264_cpu_name_t x264_cpu_names[] =
{
#if HAVE_MMX
// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
-// {"CMOV", X264_CPU_CMOV}, // we require this unconditionally, so don't print it
-#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV
+#define MMX2 X264_CPU_MMX|X264_CPU_MMX2
{"MMX2", MMX2},
{"MMXEXT", MMX2},
{"SSE", MMX2|X264_CPU_SSE},
@@ -56,6 +55,7 @@ const x264_cpu_name_t x264_cpu_names[] =
{"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW},
{"SSE2", SSE2},
{"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST},
+ {"LZCNT", SSE2|X264_CPU_LZCNT},
{"SSE3", SSE2|X264_CPU_SSE3},
{"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
{"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
@@ -66,16 +66,17 @@ const x264_cpu_name_t x264_cpu_names[] =
{"XOP", AVX|X264_CPU_XOP},
{"FMA4", AVX|X264_CPU_FMA4},
{"FMA3", AVX|X264_CPU_FMA3},
- {"AVX2", AVX|X264_CPU_FMA3|X264_CPU_AVX2},
+ {"BMI1", AVX|X264_CPU_LZCNT|X264_CPU_BMI1},
+ {"BMI2", AVX|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2},
+#define AVX2 AVX|X264_CPU_FMA3|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2|X264_CPU_AVX2
+ {"AVX2", AVX2},
+ {"AVX512", AVX2|X264_CPU_AVX512},
+#undef AVX2
#undef AVX
#undef SSE2
#undef MMX2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
- {"LZCNT", X264_CPU_LZCNT},
- {"BMI1", X264_CPU_BMI1},
- {"BMI2", X264_CPU_BMI1|X264_CPU_BMI2},
- {"SlowCTZ", X264_CPU_SLOW_CTZ},
{"SlowAtom", X264_CPU_SLOW_ATOM},
{"SlowPshufb", X264_CPU_SLOW_PSHUFB},
{"SlowPalignr", X264_CPU_SLOW_PALIGNR},
@@ -118,7 +119,7 @@ static void sigill_handler( int sig )
#if HAVE_MMX
int x264_cpu_cpuid_test( void );
void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
-void x264_cpu_xgetbv( uint32_t op, uint32_t *eax, uint32_t *edx );
+uint64_t x264_cpu_xgetbv( int xcr );
uint32_t x264_cpu_detect( void )
{
@@ -126,15 +127,13 @@ uint32_t x264_cpu_detect( void )
uint32_t eax, ebx, ecx, edx;
uint32_t vendor[4] = {0};
uint32_t max_extended_cap, max_basic_cap;
- int cache;
#if !ARCH_X86_64
if( !x264_cpu_cpuid_test() )
return 0;
#endif
- x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 );
- max_basic_cap = eax;
+ x264_cpu_cpuid( 0, &max_basic_cap, vendor+0, vendor+2, vendor+1 );
if( max_basic_cap == 0 )
return 0;
@@ -145,50 +144,46 @@ uint32_t x264_cpu_detect( void )
return cpu;
if( edx&0x02000000 )
cpu |= X264_CPU_MMX2|X264_CPU_SSE;
- if( edx&0x00008000 )
- cpu |= X264_CPU_CMOV;
- else
- return cpu;
if( edx&0x04000000 )
cpu |= X264_CPU_SSE2;
if( ecx&0x00000001 )
cpu |= X264_CPU_SSE3;
if( ecx&0x00000200 )
- cpu |= X264_CPU_SSSE3;
+ cpu |= X264_CPU_SSSE3|X264_CPU_SSE2_IS_FAST;
if( ecx&0x00080000 )
cpu |= X264_CPU_SSE4;
if( ecx&0x00100000 )
cpu |= X264_CPU_SSE42;
- /* Check OXSAVE and AVX bits */
- if( (ecx&0x18000000) == 0x18000000 )
+
+ if( ecx&0x08000000 ) /* XGETBV supported and XSAVE enabled by OS */
{
- /* Check for OS support */
- x264_cpu_xgetbv( 0, &eax, &edx );
- if( (eax&0x6) == 0x6 )
+ uint64_t xcr0 = x264_cpu_xgetbv( 0 );
+ if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */
{
- cpu |= X264_CPU_AVX;
+ if( ecx&0x10000000 )
+ cpu |= X264_CPU_AVX;
if( ecx&0x00001000 )
cpu |= X264_CPU_FMA3;
- }
- }
- if( max_basic_cap >= 7 )
- {
- x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
- /* AVX2 requires OS support, but BMI1/2 don't. */
- if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) )
- cpu |= X264_CPU_AVX2;
- if( ebx&0x00000008 )
- {
- cpu |= X264_CPU_BMI1;
- if( ebx&0x00000100 )
- cpu |= X264_CPU_BMI2;
+ if( max_basic_cap >= 7 )
+ {
+ x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
+ if( ebx&0x00000008 )
+ cpu |= X264_CPU_BMI1;
+ if( ebx&0x00000100 )
+ cpu |= X264_CPU_BMI2;
+ if( ebx&0x00000020 )
+ cpu |= X264_CPU_AVX2;
+
+ if( (xcr0&0xE0) == 0xE0 ) /* OPMASK/ZMM state */
+ {
+ if( (ebx&0xD0030000) == 0xD0030000 )
+ cpu |= X264_CPU_AVX512;
+ }
+ }
}
}
- if( cpu & X264_CPU_SSSE3 )
- cpu |= X264_CPU_SSE2_IS_FAST;
-
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
@@ -228,8 +223,6 @@ uint32_t x264_cpu_detect( void )
{
if( edx&0x00400000 )
cpu |= X264_CPU_MMX2;
- if( !(cpu&X264_CPU_LZCNT) )
- cpu |= X264_CPU_SLOW_CTZ;
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
}
@@ -254,7 +247,6 @@ uint32_t x264_cpu_detect( void )
else if( model == 28 )
{
cpu |= X264_CPU_SLOW_ATOM;
- cpu |= X264_CPU_SLOW_CTZ;
cpu |= X264_CPU_SLOW_PSHUFB;
}
/* Conroe has a slow shuffle unit. Check the model number to make sure not
@@ -268,7 +260,7 @@ uint32_t x264_cpu_detect( void )
{
/* cacheline size is specified in 3 places, any of which may be missing */
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
- cache = (ebx&0xff00)>>5; // cflush size
+ int cache = (ebx&0xff00)>>5; // cflush size
if( !cache && max_extended_cap >= 0x80000006 )
{
x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
diff --git a/common/cpu.h b/common/cpu.h
index eec1be2..845034c 100644
--- a/common/cpu.h
+++ b/common/cpu.h
@@ -56,7 +56,7 @@ void x264_cpu_sfence( void );
* alignment between functions (osdep.h handles manual alignment of arrays
* if it doesn't).
*/
-#if (ARCH_X86 || STACK_ALIGNMENT > 16) && HAVE_MMX
+#if HAVE_MMX && (STACK_ALIGNMENT > 16 || (ARCH_X86 && STACK_ALIGNMENT > 4))
intptr_t x264_stack_align( void (*func)(), ... );
#define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
#else
@@ -65,7 +65,7 @@ intptr_t x264_stack_align( void (*func)(), ... );
typedef struct
{
- const char name[16];
+ const char *name;
uint32_t flags;
} x264_cpu_name_t;
extern const x264_cpu_name_t x264_cpu_names[];
diff --git a/common/dct.c b/common/dct.c
index a270c4c..70853bf 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -711,6 +711,16 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2;
#endif
}
+
+ if( cpu&X264_CPU_AVX512 )
+ {
+ dctf->sub4x4_dct = x264_sub4x4_dct_avx512;
+ dctf->sub8x8_dct = x264_sub8x8_dct_avx512;
+ dctf->sub16x16_dct = x264_sub16x16_dct_avx512;
+ dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_avx512;
+ dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx512;
+ dctf->add8x8_idct = x264_add8x8_idct_avx512;
+ }
#endif //HAVE_MMX
#if HAVE_ALTIVEC
@@ -986,6 +996,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
}
#endif // ARCH_X86_64
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512;
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
+ pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512;
+ pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
+ }
#endif // HAVE_MMX
#else
#if HAVE_MMX
@@ -1026,6 +1043,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop;
}
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512;
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
+ pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512;
+ pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
+ }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
@@ -1068,6 +1092,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
}
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf_interlaced->interleave_8x8_cavlc =
+ pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
+ }
#else
if( cpu&X264_CPU_MMX )
{
@@ -1091,6 +1120,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
}
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf_interlaced->interleave_8x8_cavlc =
+ pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
+ }
#endif // HIGH_BIT_DEPTH
#endif
#if !HIGH_BIT_DEPTH
diff --git a/common/deblock.c b/common/deblock.c
index 659fb35..0c7f128 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -676,21 +676,21 @@ void x264_deblock_h_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, i
void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
-void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
- int mvy_limit, int bframe );
-void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
- int mvy_limit, int bframe );
-void x264_deblock_strength_ssse3( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
- int mvy_limit, int bframe );
-void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
- int mvy_limit, int bframe );
-void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
- int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
- int mvy_limit, int bframe );
+void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+ int mvy_limit, int bframe );
+void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+ int mvy_limit, int bframe );
+void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+ int mvy_limit, int bframe );
+void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+ int mvy_limit, int bframe );
+void x264_deblock_strength_avx512( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+ int mvy_limit, int bframe );
void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
@@ -803,7 +803,6 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
#if !HIGH_BIT_DEPTH
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
#endif
- pf->deblock_strength = x264_deblock_strength_mmx2;
if( cpu&X264_CPU_SSE2 )
{
pf->deblock_strength = x264_deblock_strength_sse2;
@@ -852,6 +851,10 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
{
pf->deblock_strength = x264_deblock_strength_avx2;
}
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf->deblock_strength = x264_deblock_strength_avx512;
+ }
}
#endif
diff --git a/common/frame.c b/common/frame.c
index e15c1a3..4d80cbb 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -54,6 +54,8 @@ static int x264_frame_internal_csp( int external_csp )
case X264_CSP_NV16:
case X264_CSP_I422:
case X264_CSP_YV16:
+ case X264_CSP_YUYV:
+ case X264_CSP_UYVY:
case X264_CSP_V210:
return X264_CSP_NV16;
case X264_CSP_I444:
@@ -76,7 +78,7 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
int i_padv = PADV << PARAM_INTERLACED;
int align = 16;
#if ARCH_X86 || ARCH_X86_64
- if( h->param.cpu&X264_CPU_CACHELINE_64 )
+ if( h->param.cpu&X264_CPU_CACHELINE_64 || h->param.cpu&X264_CPU_AVX512 )
align = 64;
else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX )
align = 32;
@@ -221,11 +223,13 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
PREALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
PREALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
}
- PREALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
+ PREALLOC( frame->i_propagate_cost, i_mb_count * sizeof(uint16_t) );
for( int j = 0; j <= h->param.i_bframe+1; j++ )
for( int i = 0; i <= h->param.i_bframe+1; i++ )
- PREALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
+ PREALLOC( frame->lowres_costs[j][i], i_mb_count * sizeof(uint16_t) );
+ /* mbtree asm can overread the input buffers, make sure we don't read outside of allocated memory. */
+ prealloc_size += NATIVE_ALIGN;
}
if( h->param.rc.i_aq_mode )
{
@@ -408,7 +412,13 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
uint8_t *pix[3];
int stride[3];
- if( i_csp == X264_CSP_V210 )
+ if( i_csp == X264_CSP_YUYV || i_csp == X264_CSP_UYVY )
+ {
+ int p = i_csp == X264_CSP_UYVY;
+ h->mc.plane_copy_deinterleave_yuyv( dst->plane[p], dst->i_stride[p], dst->plane[p^1], dst->i_stride[p^1],
+ (pixel*)src->img.plane[0], src->img.i_stride[0], h->param.i_width, h->param.i_height );
+ }
+ else if( i_csp == X264_CSP_V210 )
{
stride[0] = src->img.i_stride[0];
pix[0] = src->img.plane[0];
diff --git a/common/macroblock.c b/common/macroblock.c
index e5097a6..6168671 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -260,7 +260,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
PREALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) );
PREALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
PREALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
- PREALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
+ PREALLOC( h->mb.slice_table, i_mb_count * sizeof(uint32_t) );
/* 0 -> 3 top(4), 4 -> 6 : left(3) */
PREALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
@@ -326,7 +326,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
PREALLOC_END( h->mb.base );
- memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );
+ memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint32_t) );
for( int i = 0; i < 2; i++ )
{
@@ -388,7 +388,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
}
- int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int16_t);
+ int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+15)&~15) * sizeof(int16_t);
scratch_size = X264_MAX( scratch_size, buf_mbtree );
if( scratch_size )
CHECKED_MALLOC( h->scratch_buffer, scratch_size );
@@ -532,16 +532,16 @@ void x264_macroblock_thread_init( x264_t *h )
h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
- h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
+ h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE;
if( CHROMA444 )
{
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE;
- h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE;
+ h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 38*FDEC_STRIDE;
}
else
{
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
- h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
+ h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE + 16;
}
}
@@ -1738,7 +1738,7 @@ void x264_macroblock_cache_save( x264_t *h )
h->mb.i_last_dqp = 0;
h->mb.i_cbp_chroma = CHROMA444 ? 0 : 2;
h->mb.i_cbp_luma = 0xf;
- h->mb.cbp[i_mb_xy] = (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma | 0x700;
+ h->mb.cbp[i_mb_xy] = (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma | 0x1700;
h->mb.b_transform_8x8 = 0;
for( int i = 0; i < 48; i++ )
h->mb.cache.non_zero_count[x264_scan8[i]] = h->param.b_cabac ? 1 : 16;
diff --git a/common/mc.c b/common/mc.c
index 156890d..65af5b9 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -325,15 +325,14 @@ void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
}
}
-static void x264_plane_copy_deinterleave_c( pixel *dstu, intptr_t i_dstu,
- pixel *dstv, intptr_t i_dstv,
- pixel *src, intptr_t i_src, int w, int h )
+void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
+ pixel *src, intptr_t i_src, int w, int h )
{
- for( int y=0; y<h; y++, dstu+=i_dstu, dstv+=i_dstv, src+=i_src )
+ for( int y=0; y<h; y++, dsta+=i_dsta, dstb+=i_dstb, src+=i_src )
for( int x=0; x<w; x++ )
{
- dstu[x] = src[2*x];
- dstv[x] = src[2*x+1];
+ dsta[x] = src[2*x];
+ dstb[x] = src[2*x+1];
}
}
@@ -649,6 +648,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
pf->plane_copy_swap = x264_plane_copy_swap_c;
pf->plane_copy_interleave = x264_plane_copy_interleave_c;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c;
+ pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_c;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_c;
diff --git a/common/mc.h b/common/mc.h
index 8f9a772..f3e7079 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -160,6 +160,39 @@ static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src,
x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
}
+void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
+ pixel *src, intptr_t i_src, int w, int h );
+
+/* We can utilize existing plane_copy_deinterleave() functions for YUYV/UYVY
+ * input with the additional constraint that we cannot overread src. */
+#define PLANE_COPY_YUYV(align, cpu)\
+static void x264_plane_copy_deinterleave_yuyv_##cpu( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,\
+ pixel *src, intptr_t i_src, int w, int h )\
+{\
+ int c_w = (align>>1) / sizeof(pixel) - 1;\
+ if( !(w&c_w) )\
+ x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
+ else if( w > c_w )\
+ {\
+ if( --h > 0 )\
+ {\
+ if( i_src > 0 )\
+ {\
+ x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
+ dsta += i_dsta * h;\
+ dstb += i_dstb * h;\
+ src += i_src * h;\
+ }\
+ else\
+ x264_plane_copy_deinterleave_##cpu( dsta+i_dsta, i_dsta, dstb+i_dstb, i_dstb,\
+ src+i_src, i_src, w, h );\
+ }\
+ x264_plane_copy_deinterleave_c( dsta, 0, dstb, 0, src, 0, w, 1 );\
+ }\
+ else\
+ x264_plane_copy_deinterleave_c( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
+}
+
void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
@@ -260,6 +293,8 @@ typedef struct
/* may write up to 15 pixels off the end of each plane */
void (*plane_copy_deinterleave)( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
+ void (*plane_copy_deinterleave_yuyv)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
+ pixel *src, intptr_t i_src, int w, int h );
void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h );
void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty,
diff --git a/common/osdep.h b/common/osdep.h
index 3ff86fc..ca2455d 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -139,17 +139,23 @@ int x264_is_pipe( const char *path );
#define EXPAND(x) x
#if ARCH_X86 || ARCH_X86_64
-#define NATIVE_ALIGN 32
+#define NATIVE_ALIGN 64
#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
+#define ALIGNED_64( var ) DECLARE_ALIGNED( var, 64 )
#if STACK_ALIGNMENT >= 32
#define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
#endif
+#if STACK_ALIGNMENT >= 64
+#define ALIGNED_ARRAY_64( type, name, sub1, ... ) ALIGNED_64( type name sub1 __VA_ARGS__ )
+#else
#define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )
+#endif
#else
#define NATIVE_ALIGN 16
#define ALIGNED_32 ALIGNED_16
+#define ALIGNED_64 ALIGNED_16
#define ALIGNED_ARRAY_32 ALIGNED_ARRAY_16
#define ALIGNED_ARRAY_64 ALIGNED_ARRAY_16
#endif
diff --git a/common/pixel.c b/common/pixel.c
index c5edc9e..d668491 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -201,28 +201,32 @@ PIXEL_VAR_C( x264_pixel_var_8x8, 8, 8 )
/****************************************************************************
* pixel_var2_wxh
****************************************************************************/
-#define PIXEL_VAR2_C( name, w, h, shift ) \
-static int name( pixel *pix1, intptr_t i_stride1, pixel *pix2, intptr_t i_stride2, int *ssd ) \
+#define PIXEL_VAR2_C( name, h, shift ) \
+static int name( pixel *fenc, pixel *fdec, int ssd[2] ) \
{ \
- int var = 0, sum = 0, sqr = 0; \
+ int sum_u = 0, sum_v = 0, sqr_u = 0, sqr_v = 0; \
for( int y = 0; y < h; y++ ) \
{ \
- for( int x = 0; x < w; x++ ) \
+ for( int x = 0; x < 8; x++ ) \
{ \
- int diff = pix1[x] - pix2[x]; \
- sum += diff; \
- sqr += diff * diff; \
+ int diff_u = fenc[x] - fdec[x]; \
+ int diff_v = fenc[x+FENC_STRIDE/2] - fdec[x+FDEC_STRIDE/2]; \
+ sum_u += diff_u; \
+ sum_v += diff_v; \
+ sqr_u += diff_u * diff_u; \
+ sqr_v += diff_v * diff_v; \
} \
- pix1 += i_stride1; \
- pix2 += i_stride2; \
+ fenc += FENC_STRIDE; \
+ fdec += FDEC_STRIDE; \
} \
- var = sqr - ((int64_t)sum * sum >> shift); \
- *ssd = sqr; \
- return var; \
+ ssd[0] = sqr_u; \
+ ssd[1] = sqr_v; \
+ return sqr_u - ((int64_t)sum_u * sum_u >> shift) + \
+ sqr_v - ((int64_t)sum_v * sum_v >> shift); \
}
-PIXEL_VAR2_C( x264_pixel_var2_8x16, 8, 16, 7 )
-PIXEL_VAR2_C( x264_pixel_var2_8x8, 8, 8, 6 )
+PIXEL_VAR2_C( x264_pixel_var2_8x16, 16, 7 )
+PIXEL_VAR2_C( x264_pixel_var2_8x8, 8, 6 )
#if BIT_DEPTH > 8
typedef uint32_t sum_t;
@@ -885,13 +889,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT8( ssd, _mmx2 );
INIT_ADS( _mmx2 );
- pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
- pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
-#if ARCH_X86
- pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
- pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
-#endif
-
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmx2;
@@ -962,7 +959,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( sad, _ssse3 );
INIT7( sad_x3, _ssse3 );
INIT7( sad_x4, _ssse3 );
+#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _ssse3 );
+#endif
INIT6( satd, _ssse3 );
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3;
@@ -1003,7 +1002,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_AVX )
{
INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs doesn't benefit from an aligned version */
+#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _avx );
+#endif
INIT6( satd, _avx );
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
@@ -1028,8 +1029,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT5( sad_x3, _xop );
INIT5( sad_x4, _xop );
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop;
- pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
- pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
pixf->vsad = x264_pixel_vsad_xop;
pixf->asd8 = x264_pixel_asd8_xop;
#if ARCH_X86_64
@@ -1044,10 +1043,19 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x3, _avx2 );
INIT2( sad_x4, _avx2 );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2;
pixf->vsad = x264_pixel_vsad_avx2;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2;
}
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512;
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
+ }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
@@ -1067,16 +1075,11 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( satd_x4, _mmx2 );
INIT4( hadamard_ac, _mmx2 );
INIT_ADS( _mmx2 );
- pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
- pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_mmx2;
- pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
#if ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2;
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2;
- pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
- pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
pixf->vsad = x264_pixel_vsad_mmx2;
if( cpu&X264_CPU_CACHELINE_32 )
@@ -1197,7 +1200,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_ssse3;
#endif
}
+#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _ssse3 );
+#endif
if( cpu&X264_CPU_SLOW_ATOM )
{
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom;
@@ -1280,7 +1285,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT8( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
+#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _avx );
+#endif
INIT4( hadamard_ac, _avx );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
@@ -1321,11 +1328,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop;
- pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
- pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop;
- pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
- pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
- pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop;
#endif
@@ -1338,7 +1340,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x4, _avx2 );
INIT4( satd, _avx2 );
INIT2( hadamard_ac, _avx2 );
+#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _avx2 );
+#endif
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2;
@@ -1351,6 +1355,21 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2;
#endif
}
+
+ if( cpu&X264_CPU_AVX512 )
+ {
+ INIT8( sad, _avx512 );
+ INIT8_NAME( sad_aligned, sad, _avx512 );
+ INIT7( sad_x3, _avx512 );
+ INIT7( sad_x4, _avx512 );
+ INIT8( satd, _avx512 );
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx512;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx512;
+ pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512;
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
+ }
#endif //HAVE_MMX
#if HAVE_ARMV6
@@ -1480,10 +1499,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_msa;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_msa;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_msa;
- pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa;
- pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa;
- pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16;
- pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8;
+ //pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa;
+ //pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa;
+ pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_msa;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_msa;
}
#endif // HAVE_MSA
diff --git a/common/pixel.h b/common/pixel.h
index f634312..d4dbfaf 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -93,8 +93,7 @@ typedef struct
uint64_t (*sa8d_satd[1])( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
uint64_t (*var[4])( pixel *pix, intptr_t stride );
- int (*var2[4])( pixel *pix1, intptr_t stride1,
- pixel *pix2, intptr_t stride2, int *ssd );
+ int (*var2[4])( pixel *fenc, pixel *fdec, int ssd[2] );
uint64_t (*hadamard_ac[4])( pixel *pix, intptr_t stride );
void (*ssd_nv12_core)( pixel *pixuv1, intptr_t stride1,
diff --git a/common/ppc/dct.c b/common/ppc/dct.c
index 768f390..2858bd0 100644
--- a/common/ppc/dct.c
+++ b/common/ppc/dct.c
@@ -293,12 +293,8 @@ void x264_sub16x16_dct8_altivec( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix
vec_vsx_st( dcvsum8, 0, dest ); \
}
-static void idct8_dc_altivec( uint8_t *dst, int16_t dc1, int16_t dc2 )
+static void idct8_dc_altivec( uint8_t *dst, vec_s16_t dcv )
{
- dc1 = (dc1 + 32) >> 6;
- dc2 = (dc2 + 32) >> 6;
- vec_s16_t dcv = { dc1, dc1, dc1, dc1, dc2, dc2, dc2, dc2 };
-
LOAD_ZERO;
ALTIVEC_STORE8_DC_SUM_CLIP( &dst[0*FDEC_STRIDE], dcv );
ALTIVEC_STORE8_DC_SUM_CLIP( &dst[1*FDEC_STRIDE], dcv );
@@ -308,8 +304,18 @@ static void idct8_dc_altivec( uint8_t *dst, int16_t dc1, int16_t dc2 )
void x264_add8x8_idct_dc_altivec( uint8_t *p_dst, int16_t dct[4] )
{
- idct8_dc_altivec( &p_dst[0], dct[0], dct[1] );
- idct8_dc_altivec( &p_dst[4*FDEC_STRIDE+0], dct[2], dct[3] );
+ vec_s16_t dcv;
+ vec_s16_t v32 = vec_sl( vec_splat_s16( 8 ), vec_splat_u16( 2 ) );
+ vec_u16_t v6 = vec_splat_u16( 6 );
+ vec_s16_t dctv = vec_vsx_ld( 0, dct );
+
+ dctv = vec_sra( vec_add( dctv, v32 ), v6 );
+ dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)vec_splat( dctv, 0 ), (vec_s32_t)vec_splat( dctv, 1 ) );
+ dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv, (vec_s32_t)dcv );
+ idct8_dc_altivec( &p_dst[0], dcv );
+ dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)vec_splat( dctv, 2 ), (vec_s32_t)vec_splat( dctv, 3 ) );
+ dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv, (vec_s32_t)dcv );
+ idct8_dc_altivec( &p_dst[4*FDEC_STRIDE+0], dcv );
}
#define IDCT_1D_ALTIVEC(s0, s1, s2, s3, d0, d1, d2, d3) \
diff --git a/common/quant.c b/common/quant.c
index 7eef140..ae96222 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -460,9 +460,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
#if ARCH_X86
pf->denoise_dct = x264_denoise_dct_mmx;
- pf->decimate_score15 = x264_decimate_score15_mmx2;
- pf->decimate_score16 = x264_decimate_score16_mmx2;
- pf->decimate_score64 = x264_decimate_score64_mmx2;
pf->coeff_last8 = x264_coeff_last8_mmx2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
@@ -473,8 +470,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
#endif
pf->coeff_last4 = x264_coeff_last4_mmx2;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
- if( cpu&X264_CPU_LZCNT )
- pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
}
if( cpu&X264_CPU_SSE2 )
{
@@ -499,17 +494,18 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_level_run8 = x264_coeff_level_run8_sse2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
- if( cpu&X264_CPU_LZCNT )
- {
- pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
- pf->coeff_last8 = x264_coeff_last8_sse2_lzcnt;
- pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
- pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
- pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
- pf->coeff_level_run8 = x264_coeff_level_run8_sse2_lzcnt;
- pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
- pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
- }
+ }
+ if( cpu&X264_CPU_LZCNT )
+ {
+ pf->coeff_last4 = x264_coeff_last4_lzcnt;
+ pf->coeff_last8 = x264_coeff_last8_lzcnt;
+ pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt;
+ pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt;
+ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt;
+ pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt;
+ pf->coeff_level_run8 = x264_coeff_level_run8_lzcnt;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt;
}
if( cpu&X264_CPU_SSSE3 )
{
@@ -557,8 +553,20 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_8x8 = x264_dequant_8x8_avx2;
pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2;
pf->denoise_dct = x264_denoise_dct_avx2;
- if( cpu&X264_CPU_LZCNT )
- pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
+ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2;
+ }
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf->dequant_4x4 = x264_dequant_4x4_avx512;
+ pf->dequant_8x8 = x264_dequant_8x8_avx512;
+ pf->decimate_score15 = x264_decimate_score15_avx512;
+ pf->decimate_score16 = x264_decimate_score16_avx512;
+ pf->decimate_score64 = x264_decimate_score64_avx512;
+ pf->coeff_last4 = x264_coeff_last4_avx512;
+ pf->coeff_last8 = x264_coeff_last8_avx512;
+ pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
+ pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
+ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
@@ -586,9 +594,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4 = x264_quant_4x4_mmx2;
pf->quant_8x8 = x264_quant_8x8_mmx2;
pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
- pf->decimate_score15 = x264_decimate_score15_mmx2;
- pf->decimate_score16 = x264_decimate_score16_mmx2;
- pf->decimate_score64 = x264_decimate_score64_mmx2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
@@ -599,13 +604,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last8 = x264_coeff_last8_mmx2;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
- if( cpu&X264_CPU_LZCNT )
- {
- pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
- pf->coeff_last8 = x264_coeff_last8_mmx2_lzcnt;
- pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
- pf->coeff_level_run8 = x264_coeff_level_run8_mmx2_lzcnt;
- }
}
if( cpu&X264_CPU_SSE2 )
@@ -634,14 +632,19 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
- if( cpu&X264_CPU_LZCNT )
- {
- pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
- pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
- pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
- pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
- pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
- }
+ }
+
+ if( cpu&X264_CPU_LZCNT )
+ {
+ pf->coeff_last4 = x264_coeff_last4_lzcnt;
+ pf->coeff_last8 = x264_coeff_last8_lzcnt;
+ pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt;
+ pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt;
+ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt;
+ pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt;
+ pf->coeff_level_run8 = x264_coeff_level_run8_lzcnt;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt;
}
if( cpu&X264_CPU_SSSE3 )
@@ -657,17 +660,19 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score16 = x264_decimate_score16_ssse3;
pf->decimate_score64 = x264_decimate_score64_ssse3;
INIT_TRELLIS( ssse3 );
+#if ARCH_X86 || !defined( __MACH__ )
pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3;
if( cpu&X264_CPU_LZCNT )
{
- pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
- pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
+ pf->coeff_level_run4 = x264_coeff_level_run4_ssse3_lzcnt;
+ pf->coeff_level_run8 = x264_coeff_level_run8_ssse3_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt;
}
+#endif
}
if( cpu&X264_CPU_SSE4 )
@@ -717,12 +722,28 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
}
pf->decimate_score64 = x264_decimate_score64_avx2;
pf->denoise_dct = x264_denoise_dct_avx2;
- if( cpu&X264_CPU_LZCNT )
+ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2;
+#if ARCH_X86 || !defined( __MACH__ )
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2;
+#endif
+ }
+ if( cpu&X264_CPU_AVX512 )
+ {
+ if( h->param.i_cqm_preset == X264_CQM_FLAT )
+ pf->dequant_8x8 = x264_dequant_8x8_flat16_avx512;
+ else
{
- pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
- pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt;
- pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt;
+ pf->dequant_4x4 = x264_dequant_4x4_avx512;
+ pf->dequant_8x8 = x264_dequant_8x8_avx512;
}
+ pf->decimate_score15 = x264_decimate_score15_avx512;
+ pf->decimate_score16 = x264_decimate_score16_avx512;
+ pf->decimate_score64 = x264_decimate_score64_avx512;
+ pf->coeff_last8 = x264_coeff_last8_avx512;
+ pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
+ pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
+ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
}
#endif // HAVE_MMX
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index d7870a3..3644fd5 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -53,21 +53,32 @@ coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7
%endmacro
cextern coeff_last4_mmx2
-cextern coeff_last4_mmx2_lzcnt
+cextern coeff_last4_lzcnt
+%if HIGH_BIT_DEPTH
+cextern coeff_last4_avx512
+%endif
cextern coeff_last15_sse2
-cextern coeff_last15_sse2_lzcnt
+cextern coeff_last15_lzcnt
+cextern coeff_last15_avx512
cextern coeff_last16_sse2
-cextern coeff_last16_sse2_lzcnt
+cextern coeff_last16_lzcnt
+cextern coeff_last16_avx512
cextern coeff_last64_sse2
-cextern coeff_last64_sse2_lzcnt
-cextern coeff_last64_avx2_lzcnt
+cextern coeff_last64_lzcnt
+cextern coeff_last64_avx2
+cextern coeff_last64_avx512
%ifdef PIC
SECTION .data
%endif
-coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
-coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
-coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+coeff_last_lzcnt: COEFF_LAST_TABLE lzcnt, lzcnt, lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+coeff_last_avx2: COEFF_LAST_TABLE lzcnt, avx2, lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+%if HIGH_BIT_DEPTH
+coeff_last_avx512: COEFF_LAST_TABLE avx512, avx512, avx512, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+%else
+coeff_last_avx512: COEFF_LAST_TABLE lzcnt, avx512, avx512, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+%endif
%endif
SECTION .text
@@ -100,7 +111,7 @@ struc cb
.start: pointer 1
.p: pointer 1
.end: pointer 1
- align 16, resb 1
+ align 64, resb 1
.bits_encoded: resd 1
.state: resb 1024
endstruc
@@ -352,25 +363,33 @@ CABAC bmi2
%endmacro
%macro ABS_DCTCOEFS 2
-%assign i 0
-%rep %2/16
%if HIGH_BIT_DEPTH
- ABSD m0, [%1+ 0+i*64], m4
- ABSD m1, [%1+16+i*64], m5
- ABSD m2, [%1+32+i*64], m4
- ABSD m3, [%1+48+i*64], m5
- mova [rsp+ 0+i*64], m0
- mova [rsp+16+i*64], m1
- mova [rsp+32+i*64], m2
- mova [rsp+48+i*64], m3
+ %define %%abs ABSD
%else
- ABSW m0, [%1+ 0+i*32], m2
- ABSW m1, [%1+16+i*32], m3
- mova [rsp+ 0+i*32], m0
- mova [rsp+16+i*32], m1
-%endif
+ %define %%abs ABSW
+%endif
+%if mmsize == %2*SIZEOF_DCTCOEF
+ %%abs m0, [%1], m1
+ mova [rsp], m0
+%elif mmsize == %2*SIZEOF_DCTCOEF/2
+ %%abs m0, [%1+0*mmsize], m2
+ %%abs m1, [%1+1*mmsize], m3
+ mova [rsp+0*mmsize], m0
+ mova [rsp+1*mmsize], m1
+%else
+%assign i 0
+%rep %2*SIZEOF_DCTCOEF/(4*mmsize)
+ %%abs m0, [%1+(4*i+0)*mmsize], m4
+ %%abs m1, [%1+(4*i+1)*mmsize], m5
+ %%abs m2, [%1+(4*i+2)*mmsize], m4
+ %%abs m3, [%1+(4*i+3)*mmsize], m5
+ mova [rsp+(4*i+0)*mmsize], m0
+ mova [rsp+(4*i+1)*mmsize], m1
+ mova [rsp+(4*i+2)*mmsize], m2
+ mova [rsp+(4*i+3)*mmsize], m3
%assign i i+1
%endrep
+%endif
%endmacro
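
The reworked ABS_DCTCOEFS above derives its unroll factor from mmsize relative to the block size instead of hard-coding XMM-sized chunks, but the net effect is unchanged: stage the absolute values of %2 coefficients from %1 in the stack buffer used by the CABAC RD loop. Scalar equivalent (illustrative sketch):

    #include <stdint.h>
    #include <stdlib.h>

    /* dctcoef is int16_t here (int32_t for HIGH_BIT_DEPTH). */
    static void abs_dctcoefs_sketch( int16_t *tmp, const int16_t *dct, int n )
    {
        for( int i = 0; i < n; i++ )
            tmp[i] = abs( dct[i] );
    }
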
%macro SIG_OFFSET 1
@@ -403,16 +422,14 @@ CABAC bmi2
%endif
%ifdef PIC
- cglobal func, 4,13
+ cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF
lea r12, [$$]
%define GLOBAL +r12-$$
%else
- cglobal func, 4,12
+ cglobal func, 4,12,6,-maxcoeffs*SIZEOF_DCTCOEF
%define GLOBAL
%endif
-%assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15)
- SUB rsp, pad
shl r1d, 4 ; MB_INTERLACED*16
%if %1
lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r12 = sig offset 8x8
@@ -429,15 +446,13 @@ CABAC bmi2
ABS_DCTCOEFS r0, 64
%else
mov r4, r0 ; r4 = dct
- mov r6, ~SIZEOF_DCTCOEF
- and r6, r4 ; handle AC coefficient case
- ABS_DCTCOEFS r6, 16
- sub r4, r6 ; calculate our new dct pointer
+ and r4, ~SIZEOF_DCTCOEF ; handle AC coefficient case
+ ABS_DCTCOEFS r4, 16
+ xor r4, r0 ; calculate our new dct pointer
add r4, rsp ; restore AC coefficient offset
%endif
- mov r1, [%2+gprsize*r2 GLOBAL]
; for improved OOE performance, run coeff_last on the original coefficients.
- call r1 ; coeff_last[ctx_block_cat]( dct )
+ call [%2+gprsize*r2 GLOBAL] ; coeff_last[ctx_block_cat]( dct )
; we know on 64-bit that the SSE2 versions of this function only
; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
; don't need r2 in 8x8 mode.
@@ -521,7 +536,6 @@ CABAC bmi2
jge .coeff_loop
.end:
mov [r3+cb.bits_encoded-cb.state], r0d
- ADD rsp, pad
RET
%endmacro
@@ -529,15 +543,23 @@ CABAC bmi2
INIT_XMM sse2
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
-INIT_XMM sse2,lzcnt
-CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
-CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
+INIT_XMM lzcnt
+CABAC_RESIDUAL_RD 0, coeff_last_lzcnt
+CABAC_RESIDUAL_RD 1, coeff_last_lzcnt
INIT_XMM ssse3
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM ssse3,lzcnt
-CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
-CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
+CABAC_RESIDUAL_RD 0, coeff_last_lzcnt
+CABAC_RESIDUAL_RD 1, coeff_last_lzcnt
+%if HIGH_BIT_DEPTH
+INIT_ZMM avx512
+%else
+INIT_YMM avx512
+%endif
+CABAC_RESIDUAL_RD 0, coeff_last_avx512
+INIT_ZMM avx512
+CABAC_RESIDUAL_RD 1, coeff_last_avx512
%endif
;-----------------------------------------------------------------------------
@@ -615,7 +637,7 @@ CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
%endmacro
%macro CABAC_RESIDUAL 1
-cglobal cabac_block_residual_internal, 4,15
+cglobal cabac_block_residual_internal, 4,15,0,-4*64
%ifdef PIC
; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
lea r7, [$$]
@@ -625,8 +647,6 @@ cglobal cabac_block_residual_internal, 4,15
%define lastm r7d
%define GLOBAL
%endif
-%assign pad gprsize+4*2+4*64-(stack_offset&15)
- SUB rsp, pad
shl r1d, 4
%define sigoffq r8
@@ -653,8 +673,7 @@ cglobal cabac_block_residual_internal, 4,15
mov dct, r0
mov leveloffm, leveloffd
- mov r1, [%1+gprsize*r2 GLOBAL]
- call r1
+ call [%1+gprsize*r2 GLOBAL]
mov lastm, eax
; put cabac in r0; needed for cabac_encode_decision
mov r0, r3
@@ -718,14 +737,14 @@ cglobal cabac_block_residual_internal, 4,15
push r7
push r8
%else
- sub rsp, 32 ; shadow space
+ sub rsp, 40 ; shadow space and alignment
%endif
call cabac_encode_ue_bypass
%if UNIX64
pop r8
pop r7
%else
- add rsp, 32
+ add rsp, 40
%endif
pop r0
.level_gt1_end:
@@ -742,15 +761,16 @@ cglobal cabac_block_residual_internal, 4,15
%endif
dec coeffidxd
jge .level_loop
- ADD rsp, pad
RET
%endmacro
%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL coeff_last_sse2
-INIT_XMM sse2,lzcnt
-CABAC_RESIDUAL coeff_last_sse2_lzcnt
-INIT_XMM avx2,bmi2
-CABAC_RESIDUAL coeff_last_avx2_lzcnt
+INIT_XMM lzcnt
+CABAC_RESIDUAL coeff_last_lzcnt
+INIT_XMM avx2
+CABAC_RESIDUAL coeff_last_avx2
+INIT_XMM avx512
+CABAC_RESIDUAL coeff_last_avx512
%endif
diff --git a/common/x86/cpu-a.asm b/common/x86/cpu-a.asm
index c961903..4692f65 100644
--- a/common/x86/cpu-a.asm
+++ b/common/x86/cpu-a.asm
@@ -53,18 +53,16 @@ cglobal cpu_cpuid, 5,7
RET
;-----------------------------------------------------------------------------
-; void cpu_xgetbv( int op, int *eax, int *edx )
+; uint64_t cpu_xgetbv( int xcr )
;-----------------------------------------------------------------------------
-cglobal cpu_xgetbv, 3,7
- push r2
- push r1
- mov ecx, r0d
+cglobal cpu_xgetbv
+ movifnidn ecx, r0m
xgetbv
- pop r4
- mov [r4], eax
- pop r4
- mov [r4], edx
- RET
+%if ARCH_X86_64
+ shl rdx, 32
+ or rax, rdx
+%endif
+ ret
%if ARCH_X86_64
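
The new cpu_xgetbv prototype returns the selected extended control register as one 64-bit value instead of writing through two pointers; XGETBV places the low half in EAX and the high half in EDX, so the 64-bit build merges them before returning. A sketch of how the caller consumes the merged value (assumed helper name; the real checks live in common/cpu.c):

    #include <stdint.h>

    /* XCR0 bit 1 = SSE state, bit 2 = AVX state, bits 5-7 = opmask/ZMM state. */
    static int os_saves_avx512_state( uint64_t xcr0 )
    {
        return ( xcr0 & 0xE6 ) == 0xE6;   /* assumed bit pattern for the AVX-512 check */
    }
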
@@ -77,7 +75,7 @@ cglobal stack_align
%if WIN64
sub rsp, 32 ; shadow space
%endif
- and rsp, ~31
+ and rsp, ~(STACK_ALIGNMENT-1)
mov rax, r0
mov r0, r1
mov r1, r2
@@ -118,7 +116,7 @@ cglobal stack_align
push ebp
mov ebp, esp
sub esp, 12
- and esp, ~31
+ and esp, ~(STACK_ALIGNMENT-1)
mov ecx, [ebp+8]
mov edx, [ebp+12]
mov [esp], edx
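
Switching the stack_align helpers from ~31 to ~(STACK_ALIGNMENT-1) keeps the same masking trick while letting AVX-512 raise the alignment to 64: clearing the low log2(align) bits of the stack pointer rounds it down to the next aligned boundary. Worked example in C (pointer-arithmetic sketch only):

    #include <stdint.h>

    static uintptr_t align_down( uintptr_t addr, uintptr_t align )   /* align is a power of two */
    {
        return addr & ~(align - 1);   /* e.g. 0x7fff3c & ~63 == 0x7fff00 */
    }
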
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index c2f8973..33ed061 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -30,7 +30,41 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA 32
+SECTION_RODATA 64
+; AVX-512 permutation indices are bit-packed to save cache
+%if HIGH_BIT_DEPTH
+scan_frame_avx512: dd 0x00bf0200, 0x00fd7484, 0x0033a611, 0x0069d822 ; bits 0-3: 4x4_frame
+ dd 0x00a3ca95, 0x00dd8d08, 0x00e75b8c, 0x00a92919 ; bits 4-8: 8x8_frame1
+ dd 0x0072f6a6, 0x003c8433, 0x007e5247, 0x00b6a0ba ; bits 9-13: 8x8_frame2
+ dd 0x00ecf12d, 0x00f3239e, 0x00b9540b, 0x00ff868f ; bits 14-18: 8x8_frame3
+ ; bits 19-23: 8x8_frame4
+scan_field_avx512: dd 0x0006b240, 0x000735a1, 0x0007b9c2, 0x0009bde8 ; bits 0-4: 8x8_field1
+ dd 0x000c4e69, 0x000ce723, 0x000a0004, 0x000aeb4a ; bits 5-9: 8x8_field2
+ dd 0x000b5290, 0x000bd6ab, 0x000d5ac5, 0x000ddee6 ; bits 10-14: 8x8_field3
+ dd 0x000e6f67, 0x000e842c, 0x000f0911, 0x000ff058 ; bits 15-19: 8x8_field4
+cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4: interleave1
+ dd 0x0009ca30, 0x000bdab4, 0x000deb38, 0x000ffbbc ; bits 5-9: interleave2
+ dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3
+ dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4
+%else
+dct_avx512: dd 0x10000000, 0x00021104, 0x3206314c, 0x60042048 ; bits 0-4: dct8x8_fenc bits 5-9: dct8x8_fdec
+ dd 0x98008a10, 0x20029b14, 0xba06bb5c, 0x4004aa58 ; bits 10-13: dct16x16_fenc bits 14-18: dct16x16_fdec
+ dd 0x54004421, 0x80025525, 0x7606756d, 0xe0046469 ; bits(e) 24-27: idct8x8_idct1 bits(e) 28-31: idct8x8_idct2
+ dd 0xdc00ce31, 0xa002df35, 0xfe06ff7d, 0xc004ee79 ; bits(o) 24-31: idct8x8_gather
+scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3: 4x4_frame
+ dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9: 8x8_frame1
+ dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2
+ dw 0xb9b0, 0x9e20, 0xbe90, 0xdb00, 0xf780, 0xfb10, 0xdea0, 0xfe30
+scan_field_avx512: dw 0x0700, 0x0741, 0x0782, 0x07c8, 0x08c9, 0x0a43, 0x0c04, 0x0a8a ; bits 0-5: 8x8_field1
+ dw 0x0910, 0x094b, 0x0985, 0x09c6, 0x0ac7, 0x0c4c, 0x0c91, 0x0b18 ; bits 6-11: 8x8_field2
+ dw 0x0b52, 0x0b8d, 0x0bce, 0x0ccf, 0x0e13, 0x0e59, 0x0d20, 0x0d5a
+ dw 0x0d94, 0x0dd5, 0x0e96, 0x0ed7, 0x0f1b, 0x0f61, 0x0fa8, 0x0fe2
+cavlc_shuf_avx512: dw 0x0080, 0x0184, 0x0288, 0x038c, 0x0490, 0x0594, 0x0698, 0x079c ; bits 0-5: interleave1
+ dw 0x08a0, 0x09a4, 0x0aa8, 0x0bac, 0x0cb0, 0x0db4, 0x0eb8, 0x0fbc ; bits 6-11: interleave2
+ dw 0x00c1, 0x01c5, 0x02c9, 0x03cd, 0x04d1, 0x05d5, 0x06d9, 0x07dd
+ dw 0x08e1, 0x09e5, 0x0ae9, 0x0bed, 0x0cf1, 0x0df5, 0x0ef9, 0x0ffd
+%endif
+
pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
@@ -580,6 +614,217 @@ cglobal sub16x16_dct, 3,3,6
DCT4_1D 0, 1, 2, 3, 4
STORE16_DCT_AVX2 0, 1, 2, 3, 4
ret
+
+%macro DCT4x4_AVX512 0
+ psubw m0, m2 ; 0 1
+ psubw m1, m3 ; 3 2
+ SUMSUB_BA w, 1, 0, 2
+ SBUTTERFLY wd, 1, 0, 2
+ paddw m2, m1, m0
+ psubw m3, m1, m0
+ paddw m2 {k1}, m1 ; 0+1+2+3 0<<1+1-2-3<<1
+ psubw m3 {k1}, m0 ; 0-1-2+3 0-1<<1+2<<1-3
+ shufps m1, m2, m3, q2323 ; a3 b3 a2 b2 c3 d3 c2 d2
+ punpcklqdq m2, m3 ; a0 b0 a1 b1 c0 d0 c1 d1
+ SUMSUB_BA w, 1, 2, 3
+ shufps m3, m1, m2, q3131 ; a1+a2 b1+b2 c1+c2 d1+d2 a1-a2 b1-b2 c1-c2 d1-d2
+ shufps m1, m2, q2020 ; a0+a3 b0+b3 c0+c3 d0+d3 a0-a3 b0-b3 c0-c3 d0-d3
+ paddw m2, m1, m3
+ psubw m0, m1, m3
+ paddw m2 {k2}, m1 ; 0'+1'+2'+3' 0'<<1+1'-2'-3'<<1
+ psubw m0 {k2}, m3 ; 0'-1'-2'+3' 0'-1'<<1+2'<<1-3'
+%endmacro
+
+INIT_XMM avx512
+cglobal sub4x4_dct
+ mov eax, 0xf0aa
+ kmovw k1, eax
+ PROLOGUE 3,3
+ movd m0, [r1+0*FENC_STRIDE]
+ movd m2, [r2+0*FDEC_STRIDE]
+ vpbroadcastd m0 {k1}, [r1+1*FENC_STRIDE]
+ vpbroadcastd m2 {k1}, [r2+1*FDEC_STRIDE]
+ movd m1, [r1+3*FENC_STRIDE]
+ movd m3, [r2+3*FDEC_STRIDE]
+ vpbroadcastd m1 {k1}, [r1+2*FENC_STRIDE]
+ vpbroadcastd m3 {k1}, [r2+2*FDEC_STRIDE]
+ kshiftrw k2, k1, 8
+ pxor m4, m4
+ punpcklbw m0, m4
+ punpcklbw m2, m4
+ punpcklbw m1, m4
+ punpcklbw m3, m4
+ DCT4x4_AVX512
+ mova [r0], m2
+ mova [r0+16], m0
+ RET
+
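
DCT4x4_AVX512 is the usual H.264 4x4 forward transform; roughly the only novelty is that both butterfly outputs are kept in a single register via the merge-masked paddw/psubw ({k1}/{k2}). For reference, the 1-D stage it applies to rows and then columns is (sketch along the lines of the scalar reference; names illustrative):

    #include <stdint.h>

    static void dct4_1d( int16_t d[4] )
    {
        int s03 = d[0] + d[3], d03 = d[0] - d[3];
        int s12 = d[1] + d[2], d12 = d[1] - d[2];
        d[0] =   s03 +   s12;
        d[1] = 2*d03 +   d12;
        d[2] =   s03 -   s12;
        d[3] =   d03 - 2*d12;
    }
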
+INIT_ZMM avx512
+cglobal dct4x4x4_internal
+ punpcklbw m0, m1, m4
+ punpcklbw m2, m3, m4
+ punpckhbw m1, m4
+ punpckhbw m3, m4
+ DCT4x4_AVX512
+ mova m1, m2
+ vshufi32x4 m2 {k2}, m0, m0, q2200 ; m0
+ vshufi32x4 m0 {k3}, m1, m1, q3311 ; m1
+ ret
+
+%macro DCT8x8_LOAD_FENC_AVX512 4 ; dst, perm, row1, row2
+ movu %1, [r1+%3*FENC_STRIDE]
+ vpermt2d %1, %2, [r1+%4*FENC_STRIDE]
+%endmacro
+
+%macro DCT8x8_LOAD_FDEC_AVX512 5 ; dst, perm, tmp, row1, row2
+ movu %1, [r2+(%4 )*FDEC_STRIDE]
+ vmovddup %1 {k1}, [r2+(%4+2)*FDEC_STRIDE]
+ movu %3, [r2+(%5 )*FDEC_STRIDE]
+ vmovddup %3 {k1}, [r2+(%5+2)*FDEC_STRIDE]
+ vpermt2d %1, %2, %3
+%endmacro
+
+cglobal sub8x8_dct, 3,3
+ mova m0, [dct_avx512]
+ DCT8x8_LOAD_FENC_AVX512 m1, m0, 0, 4 ; 0 2 1 3
+ mov r1d, 0xaaaaaaaa
+ kmovd k1, r1d
+ psrld m0, 5
+ DCT8x8_LOAD_FDEC_AVX512 m3, m0, m2, 0, 4
+ mov r1d, 0xf0f0f0f0
+ kmovd k2, r1d
+ pxor xm4, xm4
+ knotw k3, k2
+ call dct4x4x4_internal_avx512
+ mova [r0], m0
+ mova [r0+64], m1
+ RET
+
+%macro SUB4x16_DCT_AVX512 2 ; dst, src
+ vpermd m1, m5, [r1+1*%2*64]
+ mova m3, [r2+2*%2*64]
+ vpermt2d m3, m6, [r2+2*%2*64+64]
+ call dct4x4x4_internal_avx512
+ mova [r0+%1*64 ], m0
+ mova [r0+%1*64+128], m1
+%endmacro
+
+cglobal sub16x16_dct
+ psrld m5, [dct_avx512], 10
+ mov eax, 0xaaaaaaaa
+ kmovd k1, eax
+ mov eax, 0xf0f0f0f0
+ kmovd k2, eax
+ PROLOGUE 3,3
+ pxor xm4, xm4
+ knotw k3, k2
+ psrld m6, m5, 4
+ SUB4x16_DCT_AVX512 0, 0
+ SUB4x16_DCT_AVX512 1, 1
+ SUB4x16_DCT_AVX512 4, 2
+ SUB4x16_DCT_AVX512 5, 3
+ RET
+
+cglobal sub8x8_dct_dc, 3,3
+ mova m3, [dct_avx512]
+ DCT8x8_LOAD_FENC_AVX512 m0, m3, 0, 4 ; 0 2 1 3
+ mov r1d, 0xaa
+ kmovb k1, r1d
+ psrld m3, 5
+ DCT8x8_LOAD_FDEC_AVX512 m1, m3, m2, 0, 4
+ pxor xm3, xm3
+ psadbw m0, m3
+ psadbw m1, m3
+ psubw m0, m1
+ vpmovqw xmm0, m0
+ vprold xmm1, xmm0, 16
+ paddw xmm0, xmm1 ; 0 0 2 2 1 1 3 3
+ punpckhqdq xmm2, xmm0, xmm0
+ psubw xmm1, xmm0, xmm2 ; 0-1 0-1 2-3 2-3
+ paddw xmm0, xmm2 ; 0+1 0+1 2+3 2+3
+ punpckldq xmm0, xmm1 ; 0+1 0+1 0-1 0-1 2+3 2+3 2-3 2-3
+ punpcklqdq xmm1, xmm0, xmm0
+ psubw xmm0 {k1}, xm3, xmm0
+ paddw xmm0, xmm1 ; 0+1+2+3 0+1-2-3 0-1+2-3 0-1-2+3
+ movhps [r0], xmm0
+ RET
+
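
sub8x8_dct_dc only needs the DC term of each 4x4 sub-block, and for this transform the DC equals the plain sum of the pixel differences, which is why psadbw against zero does most of the work. The remainder of the function is a 2x2 Hadamard on the four DCs, matching the "0+1+2+3 0+1-2-3 ..." comments; in scalar terms (illustrative sketch):

    #include <stdint.h>

    /* dc[i] = sum over 4x4 sub-block i of (fenc - fdec), then a 2x2 Hadamard. */
    static void dc_2x2_hadamard( int16_t dc[4] )
    {
        int d0 = dc[0] + dc[1], d1 = dc[0] - dc[1];
        int d2 = dc[2] + dc[3], d3 = dc[2] - dc[3];
        dc[0] = d0 + d2;   /* 0+1+2+3 */
        dc[1] = d0 - d2;   /* 0+1-2-3 */
        dc[2] = d1 + d3;   /* 0-1+2-3 */
        dc[3] = d1 - d3;   /* 0-1-2+3 */
    }
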
+cglobal sub8x16_dct_dc, 3,3
+ mova m5, [dct_avx512]
+ DCT8x8_LOAD_FENC_AVX512 m0, m5, 0, 8 ; 0 4 1 5
+ DCT8x8_LOAD_FENC_AVX512 m1, m5, 4, 12 ; 2 6 3 7
+ mov r1d, 0xaa
+ kmovb k1, r1d
+ psrld m5, 5
+ DCT8x8_LOAD_FDEC_AVX512 m2, m5, m4, 0, 8
+ DCT8x8_LOAD_FDEC_AVX512 m3, m5, m4, 4, 12
+ pxor xm4, xm4
+ psadbw m0, m4
+ psadbw m1, m4
+ psadbw m2, m4
+ psadbw m3, m4
+ psubw m0, m2
+ psubw m1, m3
+ SBUTTERFLY qdq, 0, 1, 2
+ paddw m0, m1
+ vpmovqw xmm0, m0 ; 0 2 4 6 1 3 5 7
+ psrlq xmm2, xmm0, 32
+ psubw xmm1, xmm0, xmm2 ; 0-4 2-6 1-5 3-7
+ paddw xmm0, xmm2 ; 0+4 2+6 1+5 3+7
+ punpckhdq xmm2, xmm0, xmm1
+ punpckldq xmm0, xmm1
+ psubw xmm1, xmm0, xmm2 ; 0-1+4-5 2-3+6-7 0-1-4+5 2-3-6+7
+ paddw xmm0, xmm2 ; 0+1+4+5 2+3+6+7 0+1-4-5 2+3-6-7
+ punpcklwd xmm0, xmm1
+ psrlq xmm2, xmm0, 32
+ psubw xmm1, xmm0, xmm2 ; 0+1-2-3+4+5-6-7 0-1-2+3+4-5-6+7 0+1-2-3-4-5+6+7 0-1-2+3-4+5+6-7
+ paddw xmm0, xmm2 ; 0+1+2+3+4+5+6+7 0-1+2-3+4-5+6-7 0+1+2+3-4-5-6-7 0-1+2-3-4+5-6+7
+ shufps xmm0, xmm1, q0220
+ mova [r0], xmm0
+ RET
+
+%macro SARSUMSUB 3 ; a, b, tmp
+ mova m%3, m%1
+ vpsraw m%1 {k1}, 1
+ psubw m%1, m%2 ; 0-2 1>>1-3
+ vpsraw m%2 {k1}, 1
+ paddw m%2, m%3 ; 0+2 1+3>>1
+%endmacro
+
+cglobal add8x8_idct, 2,2
+ mova m1, [r1]
+ mova m2, [r1+64]
+ mova m3, [dct_avx512]
+ vbroadcasti32x4 m4, [pw_32]
+ mov r1d, 0xf0f0f0f0
+ kxnorb k2, k2, k2
+ kmovd k1, r1d
+ kmovb k3, k2
+ vshufi32x4 m0, m1, m2, q2020 ; 0 1 4 5 8 9 c d
+ vshufi32x4 m1, m2, q3131 ; 2 3 6 7 a b e f
+ psrlq m5, m3, 56 ; {0, 3, 1, 2, 4, 7, 5, 6} * FDEC_STRIDE
+ vpgatherqq m6 {k2}, [r0+m5]
+ SARSUMSUB 0, 1, 2
+ SBUTTERFLY wd, 1, 0, 2
+ psrlq m7, m3, 28
+ SUMSUB_BA w, 0, 1, 2 ; 0+1+2+3>>1 0+1>>1-2-3
+ vprold m1, 16 ; 0-1>>1-2+3 0-1+2-3>>1
+ SBUTTERFLY dq, 0, 1, 2
+ psrlq m3, 24
+ SARSUMSUB 0, 1, 2
+ vpermi2q m3, m1, m0
+ vpermt2q m1, m7, m0
+ paddw m3, m4 ; += 32
+ SUMSUB_BA w, 1, 3, 0
+ psraw m1, 6 ; 0'+1'+2'+3'>>1 0'+1'>>1-2'-3'
+ psraw m3, 6 ; 0'-1'+2'-3'>>1 0'-1'>>1-2'+3'
+ pxor xm0, xm0
+ SBUTTERFLY bw, 6, 0, 2
+ paddsw m1, m6
+ paddsw m3, m0
+ packuswb m1, m3
+ vpscatterqq [r0+m5] {k3}, m1
+ RET
%endif ; HIGH_BIT_DEPTH
INIT_MMX
@@ -1883,3 +2128,161 @@ cglobal zigzag_interleave_8x8_cavlc, 3,3,6
mov [r2+8], r0w
RET
%endif ; !HIGH_BIT_DEPTH
+
+%if HIGH_BIT_DEPTH
+INIT_ZMM avx512
+cglobal zigzag_scan_4x4_frame, 2,2
+ mova m0, [scan_frame_avx512]
+ vpermd m0, m0, [r1]
+ mova [r0], m0
+ RET
+
+cglobal zigzag_scan_4x4_field, 2,2
+ mova m0, [r1]
+ pshufd xmm1, [r1+8], q3102
+ mova [r0], m0
+ movu [r0+8], xmm1
+ RET
+
+cglobal zigzag_scan_8x8_frame, 2,2
+ psrld m0, [scan_frame_avx512], 4
+ mova m1, [r1+0*64]
+ mova m2, [r1+1*64]
+ mova m3, [r1+2*64]
+ mova m4, [r1+3*64]
+ mov r1d, 0x01fe7f80
+ kmovd k1, r1d
+ kshiftrd k2, k1, 16
+ vpermd m5, m0, m3 ; __ __ __ __ __ __ __ __ __ __ __ __ __ __ 32 40
+ psrld m6, m0, 5
+ vpermi2d m0, m1, m2 ; 0 8 1 2 9 16 24 17 10 3 4 11 18 25 __ __
+ vmovdqa64 m0 {k1}, m5
+ mova [r0+0*64], m0
+ mova m5, m1
+ vpermt2d m1, m6, m2 ; __ 26 19 12 5 6 13 20 27 __ __ __ __ __ __ __
+ psrld m0, m6, 5
+ vpermi2d m6, m3, m4 ; 33 __ __ __ __ __ __ __ __ 34 41 48 56 49 42 35
+ vmovdqa32 m6 {k2}, m1
+ mova [r0+1*64], m6
+ vpermt2d m5, m0, m2 ; 28 21 14 7 15 22 29 __ __ __ __ __ __ __ __ 30
+ psrld m1, m0, 5
+ vpermi2d m0, m3, m4 ; __ __ __ __ __ __ __ 36 43 50 57 58 51 44 37 __
+ vmovdqa32 m5 {k1}, m0
+ mova [r0+2*64], m5
+ vpermt2d m3, m1, m4 ; __ __ 38 45 52 59 60 53 46 39 47 54 61 62 55 63
+ vpermd m2, m1, m2 ; 23 31 __ __ __ __ __ __ __ __ __ __ __ __ __ __
+ vmovdqa64 m2 {k2}, m3
+ mova [r0+3*64], m2
+ RET
+
+cglobal zigzag_scan_8x8_field, 2,2
+ mova m0, [scan_field_avx512]
+ mova m1, [r1+0*64]
+ mova m2, [r1+1*64]
+ mova m3, [r1+2*64]
+ mova m4, [r1+3*64]
+ mov r1d, 0x3f
+ kmovb k1, r1d
+ psrld m5, m0, 5
+ vpermi2d m0, m1, m2
+ vmovdqa64 m1 {k1}, m3 ; 32 33 34 35 36 37 38 39 40 41 42 43 12 13 14 15
+ vpermt2d m1, m5, m2
+ psrld m5, 5
+ vmovdqa64 m2 {k1}, m4 ; 48 49 50 51 52 53 54 55 56 57 58 59 28 29 30 31
+ vpermt2d m2, m5, m3
+ psrld m5, 5
+ vpermt2d m3, m5, m4
+ mova [r0+0*64], m0
+ mova [r0+1*64], m1
+ mova [r0+2*64], m2
+ mova [r0+3*64], m3
+ RET
+
+cglobal zigzag_interleave_8x8_cavlc, 3,3
+ mova m0, [cavlc_shuf_avx512]
+ mova m1, [r1+0*64]
+ mova m2, [r1+1*64]
+ mova m3, [r1+2*64]
+ mova m4, [r1+3*64]
+ kxnorb k1, k1, k1
+ por m7, m1, m2
+ psrld m5, m0, 5
+ vpermi2d m0, m1, m2 ; a0 a1 b0 b1
+ vpternlogd m7, m3, m4, 0xfe ; m1|m2|m3|m4
+ psrld m6, m5, 5
+ vpermi2d m5, m3, m4 ; b2 b3 a2 a3
+ vptestmd k0, m7, m7
+ vpermt2d m1, m6, m2 ; c0 c1 d0 d1
+ psrld m6, 5
+ vpermt2d m3, m6, m4 ; d2 d3 c2 c3
+ vshufi32x4 m2, m0, m5, q1032 ; b0 b1 b2 b3
+ vmovdqa32 m5 {k1}, m0 ; a0 a1 a2 a3
+ vshufi32x4 m4, m1, m3, q1032 ; d0 d1 d2 d3
+ vmovdqa32 m3 {k1}, m1 ; c0 c1 c2 c3
+ mova [r0+0*64], m5
+ mova [r0+1*64], m2
+ mova [r0+2*64], m3
+ mova [r0+3*64], m4
+ kmovw r1d, k0
+ test r1d, 0x1111
+ setnz [r2]
+ test r1d, 0x2222
+ setnz [r2+1]
+ test r1d, 0x4444
+ setnz [r2+8]
+ test r1d, 0x8888
+ setnz [r2+9]
+ RET
+
+%else ; !HIGH_BIT_DEPTH
+INIT_YMM avx512
+cglobal zigzag_scan_4x4_frame, 2,2
+ mova m0, [scan_frame_avx512]
+ vpermw m0, m0, [r1]
+ mova [r0], m0
+ RET
+
+cglobal zigzag_scan_4x4_field, 2,2
+ mova m0, [r1]
+ pshuflw xmm1, [r1+4], q3102
+ mova [r0], m0
+ movq [r0+4], xmm1
+ RET
+
+INIT_ZMM avx512
+cglobal zigzag_scan_8x8_frame, 2,2
+ psrlw m0, [scan_frame_avx512], 4
+scan8_avx512:
+ mova m1, [r1]
+ mova m2, [r1+64]
+ psrlw m3, m0, 6
+ vpermi2w m0, m1, m2
+ vpermt2w m1, m3, m2
+ mova [r0], m0
+ mova [r0+64], m1
+ RET
+
+cglobal zigzag_scan_8x8_field, 2,2
+ mova m0, [scan_field_avx512]
+ jmp scan8_avx512
+
+cglobal zigzag_interleave_8x8_cavlc, 3,3
+ mova m0, [cavlc_shuf_avx512]
+ mova m1, [r1]
+ mova m2, [r1+64]
+ psrlw m3, m0, 6
+ vpermi2w m0, m1, m2
+ vpermt2w m1, m3, m2
+ kxnorb k2, k2, k2
+ vptestmd k0, m0, m0
+ vptestmd k1, m1, m1
+ mova [r0], m0
+ mova [r0+64], m1
+ ktestw k2, k0
+ setnz [r2]
+ setnc [r2+1]
+ ktestw k2, k1
+ setnz [r2+8]
+ setnc [r2+9]
+ RET
+%endif ; !HIGH_BIT_DEPTH
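
Both AVX-512 zigzag_interleave_8x8_cavlc variants above do the same job as the existing ones: split an interleaved 8x8 block into its four 4x4 CAVLC sub-blocks and record a non-zero flag per sub-block, with the flag tests folded into mask registers (vptestmd/ktestw) rather than separate compares. The behaviour, modelled loosely on the C fallback (sketch, not copied verbatim):

    #include <stdint.h>

    /* nnz flags land at offsets 0, 1, 8, 9, matching the setnz stores above. */
    static void interleave_8x8_cavlc_sketch( int16_t *dst, const int16_t *src, uint8_t *nnz )
    {
        for( int i = 0; i < 4; i++ )
        {
            int nz = 0;
            for( int j = 0; j < 16; j++ )
            {
                nz |= src[i + j*4];
                dst[i*16 + j] = src[i + j*4];
            }
            nnz[(i&1) + (i>>1)*8] = !!nz;
        }
    }
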
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 67221c3..20a65c5 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -34,6 +34,7 @@ void x264_sub16x16_dct_mmx ( dctcoef dct[16][16], pixel *pix1, pixel *pix2
void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_avx512 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
@@ -41,12 +42,16 @@ void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2
void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
-void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
-void x264_sub8x16_dct_dc_ssse3( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
+void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_mmx2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
+void x264_sub8x8_dct_dc_avx512 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 );
+void x264_sub8x16_dct_dc_ssse3 ( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 );
+void x264_sub8x16_dct_dc_avx512( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 );
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] );
@@ -59,6 +64,7 @@ void x264_add16x16_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [16] );
void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add8x8_idct_avx2 ( pixel *p_dst, dctcoef dct[ 4][16] );
+void x264_add8x8_idct_avx512 ( uint8_t *p_dst, int16_t dct[ 4][16] );
void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] );
void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] );
void x264_add16x16_idct_avx2 ( pixel *p_dst, dctcoef dct[16][16] );
@@ -101,22 +107,26 @@ void x264_add16x16_idct8_sse2( pixel *dst, dctcoef dct[4][64] );
void x264_add8x8_idct8_avx ( pixel *dst, dctcoef dct [64] );
void x264_add16x16_idct8_avx ( pixel *dst, dctcoef dct[4][64] );
-void x264_zigzag_scan_8x8_frame_xop ( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
-void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
-void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] );
-void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
-void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
-void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
-void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
-void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
-void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
+void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
+void x264_zigzag_scan_8x8_frame_xop ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_avx512( dctcoef level[64], dctcoef dct[64] );
+void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
+void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
+void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] );
+void x264_zigzag_scan_4x4_frame_avx512( dctcoef level[16], dctcoef dct[16] );
+void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
+void x264_zigzag_scan_4x4_field_avx512( dctcoef level[16], dctcoef dct[16] );
+void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
+void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
+void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_field_avx512( dctcoef level[64], dctcoef dct[64] );
int x264_zigzag_sub_4x4_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4ac_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
@@ -125,9 +135,10 @@ int x264_zigzag_sub_4x4_field_avx ( int16_t level[16], const uint8_t *src, u
int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4ac_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
-void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz );
-void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz );
-void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
-void x264_zigzag_interleave_8x8_cavlc_avx2( int16_t *dst, int16_t *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_sse2 ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_avx2 ( int16_t *dst, int16_t *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_avx512( dctcoef *dst, dctcoef *src, uint8_t *nnz );
#endif
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index abebafa..917119b 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -28,10 +28,14 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA 32
-
-load_bytes_shuf: times 2 db 3,4,5,6,11,12,13,14,4,5,6,7,12,13,14,15
-insert_top_shuf: dd 0,1,4,5,7,2,3,6
+SECTION_RODATA 64
+
+load_bytes_zmm_shuf: dd 0x50404032, 0x70606053, 0xd0c0c0b4, 0xf0e0e0d5
+ dd 0x50404036, 0x70606057, 0xd0c0c0b8, 0xf0e0e0d9
+ dd 0x50104001, 0x70306023, 0xd090c083, 0xf0b0e0a5
+ dd 0x50104005, 0x70306027, 0xd090c087, 0xf0b0e0a9
+load_bytes_ymm_shuf: dd 0x06050403, 0x0e0d0c1b, 0x07060544, 0x0f0e0d5c
+ dd 0x06050473, 0x0e0d0c2b, 0x07060534, 0x0f0e0d6c
transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15
SECTION .text
@@ -2276,13 +2280,10 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8
RET
%endif ; !HIGH_BIT_DEPTH
-
-
;-----------------------------------------------------------------------------
; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2],
; uint8_t bs[2][4][4], int mvy_limit, int bframe )
;-----------------------------------------------------------------------------
-
%define scan8start (4+1*8)
%define nnz r0+scan8start
%define ref r1+scan8start
@@ -2290,145 +2291,54 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8
%define bs0 r3
%define bs1 r3+32
-%macro LOAD_BYTES_MMX 1
- movd m2, [%1+8*0-1]
- movd m0, [%1+8*0]
- movd m3, [%1+8*2-1]
- movd m1, [%1+8*2]
- punpckldq m2, [%1+8*1-1]
- punpckldq m0, [%1+8*1]
- punpckldq m3, [%1+8*3-1]
- punpckldq m1, [%1+8*3]
-%endmacro
-
-%macro DEBLOCK_STRENGTH_REFS_MMX 0
- LOAD_BYTES_MMX ref
- pxor m2, m0
- pxor m3, m1
- por m2, [bs0+0]
- por m3, [bs0+8]
- movq [bs0+0], m2
- movq [bs0+8], m3
-
- movd m2, [ref-8*1]
- movd m3, [ref+8*1]
- punpckldq m2, m0 ; row -1, row 0
- punpckldq m3, m1 ; row 1, row 2
- pxor m0, m2
- pxor m1, m3
- por m0, [bs1+0]
- por m1, [bs1+8]
- movq [bs1+0], m0
- movq [bs1+8], m1
-%endmacro
-
-%macro DEBLOCK_STRENGTH_MVS_MMX 2
- mova m0, [mv-%2]
- mova m1, [mv-%2+8]
- psubw m0, [mv]
- psubw m1, [mv+8]
- packsswb m0, m1
- ABSB m0, m1
- psubusb m0, m7
- packsswb m0, m0
- por m0, [%1]
- movd [%1], m0
-%endmacro
-
-%macro DEBLOCK_STRENGTH_NNZ_MMX 1
- por m2, m0
- por m3, m1
- mova m4, [%1]
- mova m5, [%1+8]
- pminub m2, m6
- pminub m3, m6
- pminub m4, m6 ; mv ? 1 : 0
- pminub m5, m6
- paddb m2, m2 ; nnz ? 2 : 0
- paddb m3, m3
- pmaxub m2, m4
- pmaxub m3, m5
-%endmacro
-
-%macro LOAD_BYTES_XMM 1
- movu m2, [%1-4] ; FIXME could be aligned if we changed nnz's allocation
+%macro LOAD_BYTES_XMM 2 ; src, aligned
+%if %2
+ mova m2, [%1-4]
+ mova m1, [%1+12]
+%else
+ movu m2, [%1-4]
movu m1, [%1+12]
- pslldq m0, m2, 1
+%endif
+ psllq m0, m2, 8
shufps m2, m1, q3131 ; cur nnz, all rows
- pslldq m1, 1
+ psllq m1, 8
shufps m0, m1, q3131 ; left neighbors
+%if cpuflag(avx) || (%2 && cpuflag(ssse3))
+ palignr m1, m2, [%1-20], 12
+%else
pslldq m1, m2, 4
- movd m3, [%1-8] ; could be palignr if nnz was aligned
+ movd m3, [%1-8]
por m1, m3 ; top neighbors
+%endif
%endmacro
-INIT_MMX mmx2
-cglobal deblock_strength, 6,6
- ; Prepare mv comparison register
- shl r4d, 8
- add r4d, 3 - (1<<8)
- movd m7, r4d
- SPLATW m7, m7
- mova m6, [pb_1]
- pxor m0, m0
- mova [bs0+0], m0
- mova [bs0+8], m0
- mova [bs1+0], m0
- mova [bs1+8], m0
-
-.lists:
- DEBLOCK_STRENGTH_REFS_MMX
- mov r4d, 4
-.mvs:
- DEBLOCK_STRENGTH_MVS_MMX bs0, 4
- DEBLOCK_STRENGTH_MVS_MMX bs1, 4*8
- add r2, 4*8
- add r3, 4
- dec r4d
- jg .mvs
- add r1, 40
- add r2, 4*8
- sub r3, 16
- dec r5d
- jge .lists
-
- ; Check nnz
- LOAD_BYTES_MMX nnz
- DEBLOCK_STRENGTH_NNZ_MMX bs0
- ; Transpose column output
- SBUTTERFLY bw, 2, 3, 4
- SBUTTERFLY bw, 2, 3, 4
- mova [bs0+0], m2
- mova [bs0+8], m3
- movd m2, [nnz-8*1]
- movd m3, [nnz+8*1]
- punpckldq m2, m0 ; row -1, row 0
- punpckldq m3, m1 ; row 1, row 2
- DEBLOCK_STRENGTH_NNZ_MMX bs1
- mova [bs1+0], m2
- mova [bs1+8], m3
- RET
+%if UNIX64
+ DECLARE_REG_TMP 5
+%else
+ DECLARE_REG_TMP 4
+%endif
%macro DEBLOCK_STRENGTH_XMM 0
-cglobal deblock_strength, 6,6,7
+cglobal deblock_strength, 5,5,7
; Prepare mv comparison register
shl r4d, 8
add r4d, 3 - (1<<8)
movd m6, r4d
+ movifnidn t0d, r5m
SPLATW m6, m6
pxor m4, m4 ; bs0
pxor m5, m5 ; bs1
.lists:
; Check refs
- LOAD_BYTES_XMM ref
+ LOAD_BYTES_XMM ref, 0
pxor m0, m2
pxor m1, m2
por m4, m0
por m5, m1
; Check mvs
-%if cpuflag(ssse3)
+%if cpuflag(ssse3) && notcpuflag(avx)
mova m0, [mv+4*8*0]
mova m1, [mv+4*8*1]
palignr m3, m0, [mv+4*8*0-16], 12
@@ -2481,11 +2391,11 @@ cglobal deblock_strength, 6,6,7
por m5, m0
add r1, 40
add r2, 4*8*5
- dec r5d
+ dec t0d
jge .lists
; Check nnz
- LOAD_BYTES_XMM nnz
+ LOAD_BYTES_XMM nnz, 1
por m0, m2
por m1, m2
mova m6, [pb_1]
@@ -2518,68 +2428,121 @@ INIT_XMM avx
DEBLOCK_STRENGTH_XMM
%macro LOAD_BYTES_YMM 1
- movu m0, [%1-4] ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX
- pshufb m0, [load_bytes_shuf] ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX
- mova m2, [insert_top_shuf]
- vpermq m1, m0, q3131 ; FGHI KLMN PQRS UVWX x2
- vpermd m0, m2, m0 ; EFGH JKLM OPQR TUVW ____ FGHI KLMN PQRS
- vpbroadcastd m2, [%1-8] ; ABCD ....
- vpblendd m0, m0, m2, 00010000b ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS
+ movu m0, [%1-4] ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX
+ pshufb m0, m6 ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX
+ vpermq m1, m0, q3131 ; FGHI KLMN PQRS UVWX x2
+ vpbroadcastd m2, [%1-8] ; ABCD ....
+ vpblendd m0, m0, m2, 0x80
+ vpermd m0, m7, m0 ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS
%endmacro
INIT_YMM avx2
-cglobal deblock_strength, 6,6,7
+cglobal deblock_strength, 5,5,8
+ mova m6, [load_bytes_ymm_shuf]
; Prepare mv comparison register
- shl r4d, 8
- add r4d, 3 - (1<<8)
- movd xm6, r4d
- vpbroadcastw m6, xm6
- pxor m5, m5 ; bs0,bs1
+ shl r4d, 8
+ add r4d, 3 - (1<<8)
+ movd xm5, r4d
+ movifnidn t0d, r5m
+ vpbroadcastw m5, xm5
+ psrld m7, m6, 4
+ pxor m4, m4 ; bs0,bs1
.lists:
; Check refs
LOAD_BYTES_YMM ref
- pxor m0, m1
- por m5, m0
+ pxor m0, m1
+ por m4, m0
; Check mvs
- movu xm0, [mv-4+4*8*0]
- vinserti128 m0, m0, [mv+4*8*-1], 1
- vbroadcasti128 m2, [mv+4*8* 0]
- vinserti128 m1, m2, [mv-4+4*8*1], 0
- vbroadcasti128 m3, [mv+4*8* 1]
- psubw m0, m2
- psubw m1, m3
-
- vinserti128 m2, m3, [mv-4+4*8*2], 0
- vbroadcasti128 m4, [mv+4*8* 2]
- vinserti128 m3, m4, [mv-4+4*8*3], 0
- psubw m2, m4
- vbroadcasti128 m4, [mv+4*8* 3]
- psubw m3, m4
- packsswb m0, m1
- packsswb m2, m3
- pabsb m0, m0
- pabsb m2, m2
- psubusb m0, m6
- psubusb m2, m6
- packsswb m0, m2
- por m5, m0
-
- add r1, 40
- add r2, 4*8*5
- dec r5d
+ movu xm0, [mv+0*4*8-4]
+ vinserti128 m0, m0, [mv-1*4*8 ], 1
+ vbroadcasti128 m2, [mv+0*4*8 ]
+ vinserti128 m1, m2, [mv+1*4*8-4], 0
+ psubw m0, m2
+ vbroadcasti128 m2, [mv+1*4*8 ]
+ psubw m1, m2
+ packsswb m0, m1
+ vinserti128 m1, m2, [mv+2*4*8-4], 0
+ vbroadcasti128 m3, [mv+2*4*8 ]
+ vinserti128 m2, m3, [mv+3*4*8-4], 0
+ psubw m1, m3
+ vbroadcasti128 m3, [mv+3*4*8 ]
+ psubw m2, m3
+ packsswb m1, m2
+ pabsb m0, m0
+ pabsb m1, m1
+ psubusb m0, m5
+ psubusb m1, m5
+ packsswb m0, m1
+ por m4, m0
+ add r1, 40
+ add r2, 4*8*5
+ dec t0d
jge .lists
; Check nnz
LOAD_BYTES_YMM nnz
- por m0, m1
- mova m6, [pb_1]
- pminub m0, m6
- pminub m5, m6 ; mv ? 1 : 0
- paddb m0, m0 ; nnz ? 2 : 0
- pmaxub m5, m0
- vextracti128 [bs1], m5, 1
- pshufb xm5, [transpose_shuf]
- mova [bs0], xm5
+ mova m2, [pb_1]
+ por m0, m1
+ pminub m0, m2
+ pminub m4, m2 ; mv ? 1 : 0
+ paddb m0, m0 ; nnz ? 2 : 0
+ pmaxub m0, m4
+ vextracti128 [bs1], m0, 1
+ pshufb xm0, [transpose_shuf]
+ mova [bs0], xm0
+ RET
+
+%macro LOAD_BYTES_ZMM 1
+ vpermd m1, m6, [%1-12]
+ pshufb m1, m7 ; EF FG GH HI JK KL LM MN OP PQ QR RS TU UV VW WX
+%endmacro ; AF BG CH DI FK GL HM IN KP LQ MR NS PU QV RW SX
+
+INIT_ZMM avx512
+cglobal deblock_strength, 5,5
+ mova m6, [load_bytes_zmm_shuf]
+ shl r4d, 8
+ add r4d, 3 - (1<<8)
+ vpbroadcastw m5, r4d
+ mov r4d, 0x34cc34cc ; {1,-1} * 11001100b
+ kmovb k1, r4d
+ vpbroadcastd m4, r4d
+ movifnidn t0d, r5m
+ psrld m7, m6, 4
+ pxor xm3, xm3
+
+.lists:
+ vbroadcasti64x2 m2, [mv+32]
+ vinserti64x2 m0, m2, [mv-32], 2
+ vbroadcasti64x2 m1, [mv+ 0]
+ vinserti64x2 m0, m0, [mv- 4], 0
+ vbroadcasti64x2 m1 {k1}, [mv+64]
+ vinserti64x2 m0, m0, [mv+60], 1
+ psubw m0, m1
+ vinserti64x2 m1, m1, [mv+28], 0
+ vbroadcasti64x2 m2 {k1}, [mv+96]
+ vinserti64x2 m1, m1, [mv+92], 1
+ psubw m1, m2
+ packsswb m0, m1
+ pabsb m0, m0
+ psubusb m0, m5
+
+ LOAD_BYTES_ZMM ref
+ pmaddubsw m1, m4 ; E-F F-G G-H H-I ...
+ vpternlogd m3, m0, m1, 0xfe ; m3 | m0 | m1
+ add r1, 40
+ add r2, 4*8*5
+ dec t0d
+ jge .lists
+
+ LOAD_BYTES_ZMM nnz
+ mova ym2, [pb_1]
+ vptestmw k1, m1, m1
+ vptestmw k2, m3, m3
+ vpaddb ym0 {k1}{z}, ym2, ym2 ; nnz ? 2 : 0
+ vpmaxub ym0 {k2}, ym2 ; mv ? 1 : 0
+ vextracti128 [bs1], ym0, 1
+ pshufb xm0, [transpose_shuf]
+ mova [bs0], xm0
RET
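
Every deblock_strength implementation above computes the same per-edge boundary strengths; the AVX-512 version just folds the three OR terms with vpternlogd 0xfe (a three-input OR) and turns the ref comparison into a single pmaddubsw whose result is nonzero exactly when neighbouring refs differ. The rule being vectorized, simplified to one reference list (hedged sketch; the full rule also checks list 1 for B-frames):

    #include <stdlib.h>
    #include <stdint.h>

    static int bs_for_edge( int nnz_p, int nnz_q, int ref_p, int ref_q,
                            const int16_t mv_p[2], const int16_t mv_q[2], int mvy_limit )
    {
        if( nnz_p || nnz_q )
            return 2;
        if( ref_p != ref_q ||
            abs( mv_p[0] - mv_q[0] ) >= 4 ||
            abs( mv_p[1] - mv_q[1] ) >= mvy_limit )
            return 1;
        return 0;
    }
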
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 92029ad..3c1d214 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -83,11 +83,11 @@ cextern deinterleave_shufd
%endmacro
%endif
-%macro AVG_END 0
- lea t4, [t4+t5*2*SIZEOF_PIXEL]
+%macro AVG_END 0-1 2 ; rows
lea t2, [t2+t3*2*SIZEOF_PIXEL]
+ lea t4, [t4+t5*2*SIZEOF_PIXEL]
lea t0, [t0+t1*2*SIZEOF_PIXEL]
- sub eax, 2
+ sub eax, %1
jg .height_loop
RET
%endmacro
@@ -147,17 +147,24 @@ cextern deinterleave_shufd
%endmacro
%macro BIWEIGHT_START_SSSE3 0
- movzx t6d, byte r6m ; FIXME x86_64
- mov t7d, 64
- sub t7d, t6d
- shl t7d, 8
- add t6d, t7d
- mova m4, [pw_512]
- movd xm3, t6d
+ movzx t6d, byte r6m ; FIXME x86_64
+%if mmsize > 16
+ vbroadcasti128 m4, [pw_512]
+%else
+ mova m4, [pw_512]
+%endif
+ lea t7d, [t6+(64<<8)]
+ shl t6d, 8
+ sub t7d, t6d
+%if cpuflag(avx512)
+ vpbroadcastw m3, t7d
+%else
+ movd xm3, t7d
%if cpuflag(avx2)
- vpbroadcastw m3, xm3
+ vpbroadcastw m3, xm3
%else
- SPLATW m3, m3 ; weight_dst,src
+ SPLATW m3, m3 ; weight_dst,src
+%endif
%endif
%endmacro
@@ -268,6 +275,66 @@ cglobal pixel_avg_weight_w16
mova [t0], xm0
vextracti128 [t0+t1], m0, 1
AVG_END
+
+INIT_YMM avx512
+cglobal pixel_avg_weight_w8
+ BIWEIGHT_START
+ kxnorb k1, k1, k1
+ kaddb k1, k1, k1
+ AVG_START 5
+.height_loop:
+ movq xm0, [t2]
+ movq xm2, [t4]
+ movq xm1, [t2+t3]
+ movq xm5, [t4+t5]
+ lea t2, [t2+t3*2]
+ lea t4, [t4+t5*2]
+ vpbroadcastq m0 {k1}, [t2]
+ vpbroadcastq m2 {k1}, [t4]
+ vpbroadcastq m1 {k1}, [t2+t3]
+ vpbroadcastq m5 {k1}, [t4+t5]
+ punpcklbw m0, m2
+ punpcklbw m1, m5
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ packuswb m0, m1
+ vextracti128 xmm1, m0, 1
+ movq [t0], xm0
+ movhps [t0+t1], xm0
+ lea t0, [t0+t1*2]
+ movq [t0], xmm1
+ movhps [t0+t1], xmm1
+ AVG_END 4
+
+INIT_ZMM avx512
+cglobal pixel_avg_weight_w16
+ BIWEIGHT_START
+ AVG_START 5
+.height_loop:
+ movu xm0, [t2]
+ movu xm1, [t4]
+ vinserti128 ym0, [t2+t3], 1
+ vinserti128 ym1, [t4+t5], 1
+ lea t2, [t2+t3*2]
+ lea t4, [t4+t5*2]
+ vinserti32x4 m0, [t2], 2
+ vinserti32x4 m1, [t4], 2
+ vinserti32x4 m0, [t2+t3], 3
+ vinserti32x4 m1, [t4+t5], 3
+ SBUTTERFLY bw, 0, 1, 2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ packuswb m0, m1
+ mova [t0], xm0
+ vextracti128 [t0+t1], ym0, 1
+ lea t0, [t0+t1*2]
+ vextracti32x4 [t0], m0, 2
+ vextracti32x4 [t0+t1], m0, 3
+ AVG_END 4
%endif ;HIGH_BIT_DEPTH
;=============================================================================
@@ -738,6 +805,12 @@ INIT_XMM avx2
AVG_FUNC 16, movdqu, movdqa
AVGH 16, 16
AVGH 16, 8
+INIT_XMM avx512
+AVGH 16, 16
+AVGH 16, 8
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
%endif ;HIGH_BIT_DEPTH
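
BIWEIGHT_START_SSSE3 now builds the packed weight pair as ((64-w)<<8)|w with a lea/shl/sub, but the blend itself is unchanged: pmaddubsw applies the two weights to interleaved source bytes and pmulhrsw against pw_512 implements the (x+32)>>6 rounding. In scalar form (hedged sketch of the usual biweight rounding, not the literal C reference):

    static inline int clip_u8( int x ) { return x < 0 ? 0 : x > 255 ? 255 : x; }

    /* Weighted average of two predictions, weight w in [0..64]. */
    static inline int avg_weight( int a, int b, int w )
    {
        return clip_u8( ( a*(64 - w) + b*w + 32 ) >> 6 );
    }
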
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 2e72b61..e93cfcc 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -30,18 +30,15 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA 32
-
-pw_1024: times 16 dw 1024
-filt_mul20: times 32 db 20
-filt_mul15: times 16 db 1, -5
-filt_mul51: times 16 db -5, 1
-hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
+SECTION_RODATA 64
%if HIGH_BIT_DEPTH
-v210_mask: times 4 dq 0xc00ffc003ff003ff
-v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
-v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
+v210_shuf_avx512: db 0, 0,34, 1,35,34, 4, 4,38, 5,39,38, 8, 8,42, 9, ; luma, chroma
+ db 43,42,12,12,46,13,47,46,16,16,50,17,51,50,20,20,
+ db 54,21,55,54,24,24,58,25,59,58,28,28,62,29,63,62
+v210_mask: dd 0x3ff003ff, 0xc00ffc00, 0x3ff003ff, 0xc00ffc00
+v210_luma_shuf: db 1, 2, 4, 5, 6, 7, 9,10,12,13,14,15,12,13,14,15
+v210_chroma_shuf: db 0, 1, 2, 3, 5, 6, 8, 9,10,11,13,14,10,11,13,14
; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
@@ -58,6 +55,13 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif ; !HIGH_BIT_DEPTH
+pw_1024: times 16 dw 1024
+filt_mul20: times 32 db 20
+filt_mul15: times 16 db 1, -5
+filt_mul51: times 16 db -5, 1
+hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
+
+mbtree_prop_list_avx512_shuf: dw 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7
mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6
db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14
mbtree_fix8_pack_shuf: db 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14
@@ -1044,8 +1048,8 @@ PLANE_COPY_CORE 1
%endif ; HIGH_BIT_DEPTH
%endmacro
-%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
- mova m0, [%3]
+%macro DEINTERLEAVE 6 ; dsta, dstb, src, dsta==dstb+8, shuffle constant, is aligned
+ mov%6 m0, [%3]
%if mmsize == 32
pshufb m0, %5
vpermq m0, m0, q3120
@@ -1056,7 +1060,7 @@ PLANE_COPY_CORE 1
vextracti128 [%2], m0, 1
%endif
%elif HIGH_BIT_DEPTH
- mova m1, [%3+mmsize]
+ mov%6 m1, [%3+mmsize]
psrld m2, m0, 16
psrld m3, m1, 16
pand m0, %5
@@ -1181,8 +1185,8 @@ cglobal store_interleave_chroma, 5,5
%macro PLANE_DEINTERLEAVE 0
;-----------------------------------------------------------------------------
-; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
-; pixel *dstv, intptr_t i_dstv,
+; void plane_copy_deinterleave( pixel *dsta, intptr_t i_dsta,
+; pixel *dstb, intptr_t i_dstb,
; pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
%if ARCH_X86_64
@@ -1400,43 +1404,64 @@ cglobal plane_copy_deinterleave_v210, 7,7,7
%define org_w r6m
%define h dword r7m
%endif
- FIX_STRIDES r1, r3, r6d
- shl r5, 2
- add r0, r6
- add r2, r6
- neg r6
- mov src, r4
- mov org_w, r6
- mova m2, [v210_mask]
- mova m3, [v210_luma_shuf]
- mova m4, [v210_chroma_shuf]
- mova m5, [v210_mult] ; also functions as vpermd index for avx2
- pshufd m6, m5, q1102
-
+ FIX_STRIDES r1, r3, r6d
+ shl r5, 2
+ add r0, r6
+ add r2, r6
+ neg r6
+ mov src, r4
+ mov org_w, r6
+%if cpuflag(avx512)
+ vpbroadcastd m2, [v210_mask]
+ vpbroadcastd m3, [v210_shuf_avx512]
+ psrlw m3, 6 ; dw 0, 4
+ mova m4, [v210_shuf_avx512] ; luma
+ psrlw m5, m4, 8 ; chroma
+%else
+%if mmsize == 32
+ vbroadcasti128 m2, [v210_mask]
+ vbroadcasti128 m3, [v210_luma_shuf]
+ vbroadcasti128 m4, [v210_chroma_shuf]
+%else
+ mova m2, [v210_mask]
+ mova m3, [v210_luma_shuf]
+ mova m4, [v210_chroma_shuf]
+%endif
+ mova m5, [v210_mult] ; also functions as vpermd index for avx2
+ pshufd m6, m5, q1102
+%endif
ALIGN 16
.loop:
- movu m1, [r4]
- pandn m0, m2, m1
- pand m1, m2
- pshufb m0, m3
- pshufb m1, m4
- pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
- pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
+ movu m1, [r4]
+ pandn m0, m2, m1
+ pand m1, m2
+%if cpuflag(avx512)
+ psrld m0, 10
+ vpsrlvw m1, m3
+ mova m6, m0
+ vpermt2w m0, m4, m1
+ vpermt2w m1, m5, m6
+%else
+ pshufb m0, m3
+ pshufb m1, m4
+ pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
+ pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
%if mmsize == 32
- vpermd m0, m5, m0
- vpermd m1, m5, m1
+ vpermd m0, m5, m0
+ vpermd m1, m5, m1
+%endif
%endif
- movu [r0+r6], m0
- movu [r2+r6], m1
- add r4, mmsize
- add r6, 3*mmsize/4
+ movu [r0+r6], m0
+ movu [r2+r6], m1
+ add r4, mmsize
+ add r6, mmsize*3/4
jl .loop
- add r0, r1
- add r2, r3
- add src, r5
- mov r4, src
- mov r6, org_w
- dec h
+ add r0, r1
+ add r2, r3
+ add src, r5
+ mov r4, src
+ mov r6, org_w
+ dec h
jg .loop
RET
%endmacro ; PLANE_DEINTERLEAVE_V210
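
The AVX-512 v210 path swaps the pshufb+pmulhrsw unpacking for vpsrlvw plus two vpermt2w passes, but the input format is unchanged: each 32-bit word carries three 10-bit components, and a group of four words yields six luma samples plus three interleaved Cb/Cr pairs, which is what the shuffle tables select into the two output registers. Per-word extraction in C (sketch; only the bit positions are asserted here):

    #include <stdint.h>

    /* A v210 word holds three 10-bit components in bits 0-9, 10-19 and 20-29. */
    static void v210_split_word( uint32_t w, uint16_t c[3] )
    {
        c[0] =  w        & 0x3ff;
        c[1] = (w >> 10) & 0x3ff;
        c[2] = (w >> 20) & 0x3ff;
    }
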
@@ -1461,6 +1486,8 @@ PLANE_DEINTERLEAVE_V210
INIT_YMM avx2
LOAD_DEINTERLEAVE_CHROMA
PLANE_DEINTERLEAVE_V210
+INIT_ZMM avx512
+PLANE_DEINTERLEAVE_V210
%else
INIT_XMM sse2
PLANE_DEINTERLEAVE_RGB
@@ -1473,82 +1500,85 @@ LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
PLANE_DEINTERLEAVE_RGB
%endif
-; These functions are not general-use; not only do the SSE ones require aligned input,
-; but they also will fail if given a non-mod16 size.
-; memzero SSE will fail for non-mod128.
+; These functions are not general-use; not only do they require aligned input, but memcpy
+; requires size to be a multiple of 16 and memzero requires size to be a multiple of 128.
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
%macro MEMCPY 0
cglobal memcpy_aligned, 3,3
-%if mmsize == 16
+%if mmsize == 32
test r2d, 16
- jz .copy2
- mova m0, [r1+r2-16]
- mova [r0+r2-16], m0
+ jz .copy32
+ mova xm0, [r1+r2-16]
+ mova [r0+r2-16], xm0
sub r2d, 16
-.copy2:
-%endif
- test r2d, 2*mmsize
- jz .copy4start
+ jle .ret
+.copy32:
+%endif
+ test r2d, mmsize
+ jz .loop
+ mova m0, [r1+r2-mmsize]
+ mova [r0+r2-mmsize], m0
+ sub r2d, mmsize
+ jle .ret
+.loop:
mova m0, [r1+r2-1*mmsize]
mova m1, [r1+r2-2*mmsize]
mova [r0+r2-1*mmsize], m0
mova [r0+r2-2*mmsize], m1
sub r2d, 2*mmsize
-.copy4start:
- test r2d, r2d
- jz .ret
-.copy4:
- mova m0, [r1+r2-1*mmsize]
- mova m1, [r1+r2-2*mmsize]
- mova m2, [r1+r2-3*mmsize]
- mova m3, [r1+r2-4*mmsize]
- mova [r0+r2-1*mmsize], m0
- mova [r0+r2-2*mmsize], m1
- mova [r0+r2-3*mmsize], m2
- mova [r0+r2-4*mmsize], m3
- sub r2d, 4*mmsize
- jg .copy4
+ jg .loop
.ret:
- REP_RET
+ RET
%endmacro
-INIT_MMX mmx
-MEMCPY
-INIT_XMM sse
-MEMCPY
-
;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
-%macro MEMZERO 1
+%macro MEMZERO 0
cglobal memzero_aligned, 2,2
- add r0, r1
- neg r1
-%if mmsize == 8
- pxor m0, m0
-%else
xorps m0, m0
-%endif
.loop:
-%assign i 0
-%rep %1
- mova [r0 + r1 + i], m0
-%assign i i+mmsize
+%assign %%i mmsize
+%rep 128 / mmsize
+ movaps [r0 + r1 - %%i], m0
+%assign %%i %%i+mmsize
%endrep
- add r1, mmsize*%1
- jl .loop
+ sub r1d, 128
+ jg .loop
RET
%endmacro
-INIT_MMX mmx
-MEMZERO 8
INIT_XMM sse
-MEMZERO 8
+MEMCPY
+MEMZERO
INIT_YMM avx
-MEMZERO 4
+MEMCPY
+MEMZERO
+INIT_ZMM avx512
+MEMZERO
+
+cglobal memcpy_aligned, 3,4
+ dec r2d ; offset of the last byte
+ rorx r3d, r2d, 2
+ and r2d, ~63
+ and r3d, 15 ; n = number of dwords minus one to copy in the tail
+ mova m0, [r1+r2]
+ not r3d ; bits 0-4: (n^15)+16, bits 16-31: 0xffff
+ shrx r3d, r3d, r3d ; 0xffff >> (n^15)
+ kmovw k1, r3d ; (1 << (n+1)) - 1
+ vmovdqa32 [r0+r2] {k1}, m0
+ sub r2d, 64
+ jl .ret
+.loop:
+ mova m0, [r1+r2]
+ mova [r0+r2], m0
+ sub r2d, 64
+ jge .loop
+.ret:
+ RET
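
The AVX-512 memcpy_aligned handles the partial 64-byte tail first with a masked store: since n is a multiple of 16, the tail holds 4, 8, 12 or 16 dwords, and the rorx/and/not/shrx sequence is just a compact way of building a k-mask with that many low bits set. Same arithmetic in C (sketch of the commented steps):

    #include <stdint.h>

    /* k-mask for the final partial 64-byte chunk; n is a multiple of 16. */
    static uint16_t tail_mask( uint32_t n )
    {
        uint32_t d = ( (n - 1) >> 2 ) & 15;   /* dwords minus one in the tail */
        return 0xffff >> (15 - d);            /* == (1 << (d+1)) - 1          */
    }
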
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
@@ -2147,13 +2177,13 @@ MBTREE
cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
vbroadcastss m5, [r5]
mov r5d, r6m
- lea r0, [r0+r5*2]
+ lea r2, [r2+r5*2]
add r5d, r5d
- add r1, r5
- add r2, r5
- add r3, r5
add r4, r5
neg r5
+ sub r1, r5
+ sub r3, r5
+ sub r0, r5
mova xm4, [pw_3fff]
%if notcpuflag(avx2)
pxor xm7, xm7
@@ -2165,9 +2195,8 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
pmovzxwd m2, [r1+r5] ; prop
pand xm3, xm4, [r3+r5] ; inter
pmovzxwd m3, xm3
- pminsd m3, m0
pmaddwd m1, m0
- psubd m3, m0, m3
+ psubusw m3, m0, m3
cvtdq2ps m0, m0
cvtdq2ps m1, m1
cvtdq2ps m2, m2
@@ -2184,7 +2213,7 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
movu xm1, [r4+r5]
movu xm2, [r1+r5]
pand xm3, xm4, [r3+r5]
- pminsw xm3, xm0
+ psubusw xm3, xm0, xm3
INT16_UNPACK 0
INT16_UNPACK 1
INT16_UNPACK 2
@@ -2194,7 +2223,6 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
cvtdq2ps m2, m2
cvtdq2ps m3, m3
mulps m1, m0
- subps m3, m0, m3
mulps m1, m5 ; intra*invq*fps_factor>>8
addps m1, m2 ; prop + (intra*invq*fps_factor>>8)
rcpps m2, m0 ; 1 / intra 1st approximation
@@ -2205,7 +2233,7 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
subps m2, m0 ; 2nd approximation for 1/intra
mulps m1, m2 ; / intra
%endif
- vcvtps2dq m1, m1
+ cvtps2dq m1, m1
vextractf128 xm2, m1, 1
packssdw xm1, xm2
mova [r0+r5], xm1
@@ -2219,6 +2247,39 @@ MBTREE_AVX
INIT_YMM avx2
MBTREE_AVX
+INIT_ZMM avx512
+cglobal mbtree_propagate_cost, 6,6
+ vbroadcastss m5, [r5]
+ mov r5d, 0x3fff3fff
+ vpbroadcastd ym4, r5d
+ mov r5d, r6m
+ lea r2, [r2+r5*2]
+ add r5d, r5d
+ add r1, r5
+ neg r5
+ sub r4, r5
+ sub r3, r5
+ sub r0, r5
+.loop:
+ pmovzxwd m0, [r2+r5] ; intra
+ pmovzxwd m1, [r1+r5] ; prop
+ pmovzxwd m2, [r4+r5] ; invq
+ pand ym3, ym4, [r3+r5] ; inter
+ pmovzxwd m3, ym3
+ psubusw m3, m0, m3
+ cvtdq2ps m0, m0
+ cvtdq2ps m1, m1
+ cvtdq2ps m2, m2
+ cvtdq2ps m3, m3
+ vdivps m1, m0, {rn-sae}
+ fmaddps m1, m2, m5, m1
+ mulps m1, m3
+ cvtps2dq m1, m1
+ vpmovsdw [r0+r5], m1
+ add r5, 32
+ jl .loop
+ RET
+
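
The AVX-512 mbtree_propagate_cost evaluates the same per-block expression as the SSE2/AVX2 versions, reassociated so a single vdivps with embedded rounding replaces the reciprocal refinement: (propagate_in + intra*invq*fps_factor) * (intra - inter) / intra, with inter clamped to intra by the psubusw. Scalar sketch of the formula (illustrative; the real reference also masks the inter costs with 0x3fff and clips the int16_t result):

    static float propagate_cost_sketch( float prop_in, float intra, float inter,
                                        float invq, float fps_factor )
    {
        if( inter > intra )
            inter = intra;   /* psubusw saturates the difference at 0 */
        return ( prop_in + intra * invq * fps_factor ) * ( intra - inter ) / intra;
    }
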
%macro MBTREE_PROPAGATE_LIST 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,
@@ -2372,6 +2433,112 @@ cglobal mbtree_propagate_list_internal, 4+2*UNIX64,5+UNIX64,8
jl .loop
RET
+%if ARCH_X86_64
+;-----------------------------------------------------------------------------
+; void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount,
+; uint16_t *lowres_costs, int bipred_weight, int mb_y,
+; int width, int height, int stride, int list_mask );
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal mbtree_propagate_list_internal, 5,7,21
+ mova xm16, [pw_0xc000]
+ vpbroadcastw xm17, r5m ; bipred_weight << 9
+ vpbroadcastw ym18, r10m ; 1 << (list+LOWRES_COST_SHIFT)
+ vbroadcasti32x8 m5, [mbtree_prop_list_avx512_shuf]
+ vbroadcasti32x8 m6, [pd_0123]
+ vpord m6, r6m {1to16} ; 0 y 1 y 2 y 3 y 4 y 5 y 6 y 7 y
+ vbroadcasti128 m7, [pd_8]
+ vbroadcasti128 m8, [pw_31]
+ vbroadcasti128 m9, [pw_32]
+ psllw m10, m9, 4
+ pcmpeqw ym19, ym19 ; pw_m1
+ vpbroadcastw ym20, r7m ; width
+ psrld m11, m7, 3 ; pd_1
+ psrld m12, m8, 16 ; pd_31
+ vpbroadcastd m13, r8m ; height
+ vpbroadcastd m14, r9m ; stride
+ pslld m15, m14, 16
+ por m15, m11 ; {1, stride, 1, stride} ...
+ lea r4, [r4+2*r0] ; lowres_costs
+ lea r3, [r3+2*r0] ; propagate_amount
+ lea r2, [r2+4*r0] ; mvs
+ neg r0
+ mov r6d, 0x5555ffff
+ kmovd k4, r6d
+ kshiftrd k5, k4, 16 ; 0x5555
+ kshiftlw k6, k4, 8 ; 0xff00
+.loop:
+ vbroadcasti128 ym1, [r4+2*r0]
+ mova xm4, [r3+2*r0]
+ vpcmpuw k1, xm1, xm16, 5 ; if (lists_used == 3)
+ vpmulhrsw xm4 {k1}, xm17 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+ vptestmw k1, ym1, ym18
+ vpermw m4, m5, m4
+
+ vbroadcasti32x8 m3, [r2+4*r0] ; {mvx, mvy}
+ psraw m0, m3, 5
+ paddw m0, m6 ; {mbx, mby} = ({x, y} >> 5) + {h->mb.i_mb_x, h->mb.i_mb_y}
+ paddd m6, m7 ; i_mb_x += 8
+ pand m3, m8 ; {x, y}
+ vprold m1, m3, 20 ; {y, x} << 4
+ psubw m3 {k4}, m9, m3 ; {32-x, 32-y}, {32-x, y}
+ psubw m1 {k5}, m10, m1 ; ({32-y, x}, {y, x}) << 4
+ pmullw m3, m1
+ paddsw m3, m3 ; prevent signed overflow in idx0 (32*32<<5 == 0x8000)
+ pmulhrsw m2, m3, m4 ; idx01weight idx23weightp
+
+ pslld ym1, ym0, 16
+ psubw ym1, ym19
+ vmovdqu16 ym1 {k5}, ym0
+ vpcmpuw k2, ym1, ym20, 1 ; {mbx, mbx+1} < width
+ kunpckwd k2, k2, k2
+ psrad m1, m0, 16
+ paddd m1 {k6}, m11
+ vpcmpud k1 {k1}, m1, m13, 1 ; mby < height | mby+1 < height
+
+ pmaddwd m0, m15
+ paddd m0 {k6}, m14 ; idx0 | idx2
+ vmovdqu16 m2 {k2}{z}, m2 ; idx01weight | idx23weight
+ vptestmd k1 {k1}, m2, m2 ; mask out offsets with no changes
+
+ ; We're handling dwords, but the offsets are in words so there may be partial overlaps.
+ ; We can work around this by handling dword-aligned and -unaligned offsets separately.
+ vptestmd k0, m0, m11
+ kandnw k2, k0, k1 ; dword-aligned offsets
+ kmovw k3, k2
+ vpgatherdd m3 {k2}, [r1+2*m0]
+
+ ; If there are conflicts in the offsets we have to handle them before storing the results.
+ ; By creating a permutation index using vplzcntd we can resolve all conflicts in parallel
+ ; in ceil(log2(n)) iterations where n is the largest number of duplicate offsets.
+ vpconflictd m4, m0
+ vpbroadcastmw2d m1, k1
+ vptestmd k2, m1, m4
+ ktestw k2, k2
+ jz .no_conflicts
+ pand m1, m4 ; mask away unused offsets to avoid false positives
+ vplzcntd m1, m1
+ pxor m1, m12 ; lzcnt gives us the distance from the msb, we want it from the lsb
+.conflict_loop:
+ vpermd m4 {k2}{z}, m1, m2
+ vpermd m1 {k2}, m1, m1 ; shift the index one step forward
+ paddsw m2, m4 ; add the weights of conflicting offsets
+ vpcmpd k2, m1, m12, 2
+ ktestw k2, k2
+ jnz .conflict_loop
+.no_conflicts:
+ paddsw m3, m2
+ vpscatterdd [r1+2*m0] {k3}, m3
+ kandw k1, k0, k1 ; dword-unaligned offsets
+ kmovw k2, k1
+ vpgatherdd m1 {k1}, [r1+2*m0]
+ paddsw m1, m2 ; all conflicts have already been resolved
+ vpscatterdd [r1+2*m0] {k2}, m1
+ add r0, 8
+ jl .loop
+ RET
+%endif
+
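
The conflict-handling block in mbtree_propagate_list_internal exists because the routine is at heart a scatter-add: several of the 16 lanes in a batch can target the same ref_costs word, and a plain vpscatterdd would keep only one of them, so vpconflictd/vplzcntd are used to sum duplicate lanes before the store. The scalar operation being vectorized (sketch; the real code splits each weight across four neighbouring macroblocks and uses saturating adds):

    #include <stdint.h>

    static void propagate_scatter_add( uint16_t *ref_costs, const int32_t *idx,
                                       const int16_t *weight, int n )
    {
        for( int i = 0; i < n; i++ )
            ref_costs[idx[i]] += weight[i];   /* duplicate idx entries must accumulate */
    }
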
%macro MBTREE_FIX8 0
;-----------------------------------------------------------------------------
; void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index f5e8be2..c06691c 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -32,7 +32,8 @@
void func##_mmx2 args;\
void func##_sse2 args;\
void func##_ssse3 args;\
- void func##_avx2 args;
+ void func##_avx2 args;\
+ void func##_avx512 args;
DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
@@ -99,17 +100,17 @@ void x264_plane_copy_interleave_core_sse2( pixel *dst, intptr_t i_dst,
void x264_plane_copy_interleave_core_avx( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
-void x264_plane_copy_deinterleave_sse2( pixel *dstu, intptr_t i_dstu,
- pixel *dstv, intptr_t i_dstv,
+void x264_plane_copy_deinterleave_sse2( pixel *dsta, intptr_t i_dsta,
+ pixel *dstb, intptr_t i_dstb,
pixel *src, intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu,
- uint8_t *dstv, intptr_t i_dstv,
+void x264_plane_copy_deinterleave_ssse3( uint8_t *dsta, intptr_t i_dsta,
+ uint8_t *dstb, intptr_t i_dstb,
uint8_t *src, intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu,
- uint16_t *dstv, intptr_t i_dstv,
+void x264_plane_copy_deinterleave_avx( uint16_t *dsta, intptr_t i_dsta,
+ uint16_t *dstb, intptr_t i_dstb,
uint16_t *src, intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_avx2( pixel *dstu, intptr_t i_dstu,
- pixel *dstv, intptr_t i_dstv,
+void x264_plane_copy_deinterleave_avx2( pixel *dsta, intptr_t i_dsta,
+ pixel *dstb, intptr_t i_dstb,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_rgb_sse2 ( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
@@ -123,15 +124,18 @@ void x264_plane_copy_deinterleave_rgb_avx2 ( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc,
pixel *src, intptr_t i_src, int pw, int w, int h );
-void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu,
- uint16_t *dstv, intptr_t i_dstv,
- uint32_t *src, intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu,
- uint16_t *dstv, intptr_t i_dstv,
- uint32_t *src, intptr_t i_src, int w, int h );
-void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu,
- uint16_t *dstv, intptr_t i_dstv,
- uint32_t *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_ssse3 ( uint16_t *dstu, intptr_t i_dstu,
+ uint16_t *dstv, intptr_t i_dstv,
+ uint32_t *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu,
+ uint16_t *dstv, intptr_t i_dstv,
+ uint32_t *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu,
+ uint16_t *dstv, intptr_t i_dstv,
+ uint32_t *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_avx512( uint16_t *dstu, intptr_t i_dstu,
+ uint16_t *dstv, intptr_t i_dstv,
+ uint32_t *src, intptr_t i_src, int w, int h );
void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
@@ -143,11 +147,12 @@ void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i
void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_avx2( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
-void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
-void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n );
-void x264_memzero_aligned_mmx( void *dst, size_t n );
-void x264_memzero_aligned_sse( void *dst, size_t n );
-void x264_memzero_aligned_avx( void *dst, size_t n );
+void *x264_memcpy_aligned_sse ( void *dst, const void *src, size_t n );
+void *x264_memcpy_aligned_avx ( void *dst, const void *src, size_t n );
+void *x264_memcpy_aligned_avx512( void *dst, const void *src, size_t n );
+void x264_memzero_aligned_sse ( void *dst, size_t n );
+void x264_memzero_aligned_avx ( void *dst, size_t n );
+void x264_memzero_aligned_avx512( void *dst, size_t n );
void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
@@ -160,14 +165,16 @@ void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride
void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
-void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_sse2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_fma4 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx512( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count );
void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count );
void x264_mbtree_fix8_unpack_ssse3( float *dst, uint16_t *src, int count );
@@ -498,6 +505,15 @@ PLANE_COPY(32, avx)
PLANE_COPY_SWAP(16, ssse3)
PLANE_COPY_SWAP(32, avx2)
+#if HIGH_BIT_DEPTH
+PLANE_COPY_YUYV(64, sse2)
+PLANE_COPY_YUYV(64, avx)
+#else
+PLANE_COPY_YUYV(32, sse2)
+PLANE_COPY_YUYV(32, ssse3)
+#endif
+PLANE_COPY_YUYV(64, avx2)
+
PLANE_INTERLEAVE(mmx2)
PLANE_INTERLEAVE(sse2)
#if HIGH_BIT_DEPTH
@@ -538,6 +554,21 @@ PROPAGATE_LIST(ssse3)
PROPAGATE_LIST(avx)
PROPAGATE_LIST(avx2)
+#if ARCH_X86_64
+void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount,
+ uint16_t *lowres_costs, int bipred_weight, int mb_y,
+ int width, int height, int stride, int list_mask );
+
+static void x264_mbtree_propagate_list_avx512( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
+ int16_t *propagate_amount, uint16_t *lowres_costs,
+ int bipred_weight, int mb_y, int len, int list )
+{
+ x264_mbtree_propagate_list_internal_avx512( len, ref_costs, mvs, propagate_amount, lowres_costs, bipred_weight << 9,
+ mb_y << 16, h->mb.i_mb_width, h->mb.i_mb_height, h->mb.i_mb_stride,
+ (1 << LOWRES_COST_SHIFT) << list );
+}
+#endif
+
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_MMX) )
@@ -547,8 +578,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
- pf->memcpy_aligned = x264_memcpy_aligned_mmx;
- pf->memzero_aligned = x264_memzero_aligned_mmx;
pf->integral_init4v = x264_integral_init4v_mmx;
pf->integral_init8v = x264_integral_init8v_mmx;
@@ -606,6 +635,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->plane_copy_interleave = x264_plane_copy_interleave_sse2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
+ pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_sse2;
if( cpu&X264_CPU_SSE2_IS_FAST )
{
@@ -661,6 +691,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
pf->plane_copy_interleave = x264_plane_copy_interleave_avx;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx;
+ pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_avx;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx;
pf->store_interleave_chroma = x264_store_interleave_chroma_avx;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx;
@@ -677,6 +708,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
}
+
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx512;
+ }
#else // !HIGH_BIT_DEPTH
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
@@ -702,6 +738,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->hpel_filter = x264_hpel_filter_sse2_amd;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
+ pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_sse2;
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_sse2;
@@ -763,6 +800,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
+ pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_ssse3;
}
if( !(cpu&X264_CPU_SLOW_PALIGNR) )
@@ -828,10 +866,20 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2;
}
+
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx512;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_avx512;
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_avx512;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_avx512;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_avx512;
+ }
#endif // HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_AVX) )
return;
+ pf->memcpy_aligned = x264_memcpy_aligned_avx;
pf->memzero_aligned = x264_memzero_aligned_avx;
pf->plane_copy = x264_plane_copy_avx;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
@@ -844,10 +892,20 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
return;
pf->plane_copy_swap = x264_plane_copy_swap_avx2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx2;
+ pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_avx2;
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
pf->get_ref = get_ref_avx2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2;
pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2;
pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2;
+
+ if( !(cpu&X264_CPU_AVX512) )
+ return;
+ pf->memcpy_aligned = x264_memcpy_aligned_avx512;
+ pf->memzero_aligned = x264_memzero_aligned_avx512;
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512;
+#if ARCH_X86_64
+ pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx512;
+#endif
}
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 42e0dd0..1ce26b9 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -32,6 +32,8 @@
%include "x86util.asm"
SECTION_RODATA 32
+var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1
+ db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1
hmul_16p: times 16 db 1
times 8 db 1, -1
hmul_8p: times 8 db 1
@@ -701,25 +703,32 @@ SSD_NV12
%if HIGH_BIT_DEPTH == 0
%if %1
mova m7, [pw_00ff]
-%elif mmsize < 32
+%elif mmsize == 16
pxor m7, m7 ; zero
%endif
%endif ; !HIGH_BIT_DEPTH
%endmacro
-%macro VAR_END 2
-%if HIGH_BIT_DEPTH && mmsize == 8 && %1*%2 == 256
- HADDUW m5, m2
-%else
- HADDW m5, m2
+%macro VAR_END 0
+ pmaddwd m5, [pw_1]
+ SBUTTERFLY dq, 5, 6, 0
+ paddd m5, m6
+%if mmsize == 32
+ vextracti128 xm6, m5, 1
+ paddd xm5, xm6
%endif
- HADDD m6, m1
+ MOVHL xm6, xm5
+ paddd xm5, xm6
%if ARCH_X86_64
- punpckldq m5, m6
- movq rax, m5
+ movq rax, xm5
+%else
+ movd eax, xm5
+%if cpuflag(avx)
+ pextrd edx, xm5, 1
%else
- movd eax, m5
- movd edx, m6
+ pshuflw xm5, xm5, q1032
+ movd edx, xm5
+%endif
%endif
RET
%endmacro
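The reworked VAR_END returns the pixel sum in the low 32 bits and the sum of squares in the high 32 bits (rax on x86-64, eax/edx on x86-32). A minimal caller-side sketch in C, assuming that packing and a shift equal to log2 of the pixel count; it illustrates how the packed value turns into a variance and is not the project's actual caller code:

#include <stdint.h>

/* packed = (sum_of_squares << 32) | sum_of_pixels, as produced by VAR_END */
static inline uint32_t var_from_packed( uint64_t packed, int shift )
{
    uint32_t sum = (uint32_t)packed;
    uint32_t sqr = (uint32_t)(packed >> 32);
    return sqr - (uint32_t)(((uint64_t)sum * sum) >> shift);
}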
@@ -739,61 +748,25 @@ SSD_NV12
paddd m6, m4
%endmacro
-%macro VAR_2ROW 2
- mov r2d, %2
-.loop:
-%if HIGH_BIT_DEPTH
- mova m0, [r0]
- mova m1, [r0+mmsize]
- mova m3, [r0+%1]
- mova m4, [r0+%1+mmsize]
-%else ; !HIGH_BIT_DEPTH
- mova m0, [r0]
- mova m3, [r0+%1]
- punpckhbw m1, m0, m7
- punpcklbw m0, m7
- punpckhbw m4, m3, m7
- punpcklbw m3, m7
-%endif ; HIGH_BIT_DEPTH
-%ifidn %1, r1
- lea r0, [r0+%1*2]
-%else
- add r0, r1
-%endif
- VAR_CORE
- dec r2d
- jg .loop
-%endmacro
-
;-----------------------------------------------------------------------------
; int pixel_var_wxh( uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_var_16x16, 2,3
- FIX_STRIDES r1
- VAR_START 0
- VAR_2ROW 8*SIZEOF_PIXEL, 16
- VAR_END 16, 16
-
-cglobal pixel_var_8x16, 2,3
- FIX_STRIDES r1
- VAR_START 0
- VAR_2ROW r1, 8
- VAR_END 8, 16
-
-cglobal pixel_var_8x8, 2,3
- FIX_STRIDES r1
- VAR_START 0
- VAR_2ROW r1, 4
- VAR_END 8, 8
-
%if HIGH_BIT_DEPTH
%macro VAR 0
cglobal pixel_var_16x16, 2,3,8
FIX_STRIDES r1
VAR_START 0
- VAR_2ROW r1, 8
- VAR_END 16, 16
+ mov r2d, 8
+.loop:
+ mova m0, [r0]
+ mova m1, [r0+mmsize]
+ mova m3, [r0+r1]
+ mova m4, [r0+r1+mmsize]
+ lea r0, [r0+r1*2]
+ VAR_CORE
+ dec r2d
+ jg .loop
+ VAR_END
cglobal pixel_var_8x8, 2,3,8
lea r2, [r1*3]
@@ -809,18 +782,16 @@ cglobal pixel_var_8x8, 2,3,8
mova m3, [r0+r1*4]
mova m4, [r0+r2*2]
VAR_CORE
- VAR_END 8, 8
+ VAR_END
%endmacro ; VAR
INIT_XMM sse2
VAR
INIT_XMM avx
VAR
-INIT_XMM xop
-VAR
-%endif ; HIGH_BIT_DEPTH
-%if HIGH_BIT_DEPTH == 0
+%else ; HIGH_BIT_DEPTH == 0
+
%macro VAR 0
cglobal pixel_var_16x16, 2,3,8
VAR_START 1
@@ -833,7 +804,7 @@ cglobal pixel_var_16x16, 2,3,8
VAR_CORE
dec r2d
jg .loop
- VAR_END 16, 16
+ VAR_END
cglobal pixel_var_8x8, 2,4,8
VAR_START 1
@@ -849,7 +820,7 @@ cglobal pixel_var_8x8, 2,4,8
VAR_CORE
dec r2d
jg .loop
- VAR_END 8, 8
+ VAR_END
cglobal pixel_var_8x16, 2,4,8
VAR_START 1
@@ -865,15 +836,13 @@ cglobal pixel_var_8x16, 2,4,8
VAR_CORE
dec r2d
jg .loop
- VAR_END 8, 16
+ VAR_END
%endmacro ; VAR
INIT_XMM sse2
VAR
INIT_XMM avx
VAR
-INIT_XMM xop
-VAR
%endif ; !HIGH_BIT_DEPTH
INIT_YMM avx2
@@ -898,209 +867,357 @@ cglobal pixel_var_16x16, 2,4,7
VAR_CORE
dec r2d
jg .loop
- vextracti128 xm0, m5, 1
- vextracti128 xm1, m6, 1
- paddw xm5, xm0
- paddd xm6, xm1
- HADDW xm5, xm2
- HADDD xm6, xm1
-%if ARCH_X86_64
- punpckldq xm5, xm6
- movq rax, xm5
+ VAR_END
+
+%macro VAR_AVX512_CORE 1 ; accum
+%if %1
+ paddw m0, m2
+ pmaddwd m2, m2
+ paddw m0, m3
+ pmaddwd m3, m3
+ paddd m1, m2
+ paddd m1, m3
%else
- movd eax, xm5
- movd edx, xm6
+ paddw m0, m2, m3
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ paddd m1, m2, m3
%endif
- RET
+%endmacro
-%macro VAR2_END 3
- HADDW %2, xm1
- movd r1d, %2
- imul r1d, r1d
- HADDD %3, xm1
- shr r1d, %1
- movd eax, %3
- movd [r4], %3
- sub eax, r1d ; sqr - (sum * sum >> shift)
- RET
+%macro VAR_AVX512_CORE_16x16 1 ; accum
+%if HIGH_BIT_DEPTH
+ mova ym2, [r0]
+ vinserti64x4 m2, [r0+r1], 1
+ mova ym3, [r0+2*r1]
+ vinserti64x4 m3, [r0+r3], 1
+%else
+ vbroadcasti64x2 ym2, [r0]
+ vbroadcasti64x2 m2 {k1}, [r0+r1]
+ vbroadcasti64x2 ym3, [r0+2*r1]
+ vbroadcasti64x2 m3 {k1}, [r0+r3]
+ pshufb m2, m4
+ pshufb m3, m4
+%endif
+ VAR_AVX512_CORE %1
%endmacro
-;-----------------------------------------------------------------------------
-; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
-;-----------------------------------------------------------------------------
-%macro VAR2_8x8_MMX 2
-cglobal pixel_var2_8x%1, 5,6
- FIX_STRIDES r1, r3
- VAR_START 0
- mov r5d, %1
-.loop:
+%macro VAR_AVX512_CORE_8x8 1 ; accum
%if HIGH_BIT_DEPTH
- mova m0, [r0]
- mova m1, [r0+mmsize]
- psubw m0, [r2]
- psubw m1, [r2+mmsize]
-%else ; !HIGH_BIT_DEPTH
- movq m0, [r0]
- movq m1, m0
- movq m2, [r2]
- movq m3, m2
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
- psubw m0, m2
- psubw m1, m3
-%endif ; HIGH_BIT_DEPTH
- paddw m5, m0
- paddw m5, m1
- pmaddwd m0, m0
- pmaddwd m1, m1
- paddd m6, m0
- paddd m6, m1
- add r0, r1
- add r2, r3
- dec r5d
- jg .loop
- VAR2_END %2, m5, m6
+ mova xm2, [r0]
+ mova xm3, [r0+r1]
+%else
+ movq xm2, [r0]
+ movq xm3, [r0+r1]
+%endif
+ vinserti128 ym2, [r0+2*r1], 1
+ vinserti128 ym3, [r0+r2], 1
+ lea r0, [r0+4*r1]
+ vinserti32x4 m2, [r0], 2
+ vinserti32x4 m3, [r0+r1], 2
+ vinserti32x4 m2, [r0+2*r1], 3
+ vinserti32x4 m3, [r0+r2], 3
+%if HIGH_BIT_DEPTH == 0
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+%endif
+ VAR_AVX512_CORE %1
%endmacro
+INIT_ZMM avx512
+cglobal pixel_var_16x16, 2,4
+ FIX_STRIDES r1
+ mov r2d, 0xf0
+ lea r3, [3*r1]
+%if HIGH_BIT_DEPTH == 0
+ vbroadcasti64x4 m4, [var_shuf_avx512]
+ kmovb k1, r2d
+%endif
+ VAR_AVX512_CORE_16x16 0
+.loop:
+ lea r0, [r0+4*r1]
+ VAR_AVX512_CORE_16x16 1
+ sub r2d, 0x50
+ jg .loop
%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-VAR2_8x8_MMX 8, 6
-VAR2_8x8_MMX 16, 7
+ pop r3d
+ %assign regs_used 3
+%endif
+var_avx512_end:
+ vbroadcasti32x4 m2, [pw_1]
+ pmaddwd m0, m2
+ SBUTTERFLY dq, 0, 1, 2
+ paddd m0, m1
+ vextracti32x8 ym1, m0, 1
+ paddd ym0, ym1
+ vextracti128 xm1, ym0, 1
+ paddd xmm0, xm0, xm1
+ punpckhqdq xmm1, xmm0, xmm0
+ paddd xmm0, xmm1
+%if ARCH_X86_64
+ movq rax, xmm0
+%else
+ movd eax, xmm0
+ pextrd edx, xmm0, 1
%endif
+ RET
+
+%if HIGH_BIT_DEPTH == 0 ; 8x8 doesn't benefit from AVX-512 in high bit-depth
+cglobal pixel_var_8x8, 2,3
+ lea r2, [3*r1]
+ pxor xm4, xm4
+ VAR_AVX512_CORE_8x8 0
+ jmp var_avx512_end
+%endif
+
+cglobal pixel_var_8x16, 2,3
+ FIX_STRIDES r1
+ lea r2, [3*r1]
+%if HIGH_BIT_DEPTH == 0
+ pxor xm4, xm4
+%endif
+ VAR_AVX512_CORE_8x8 0
+ lea r0, [r0+4*r1]
+ VAR_AVX512_CORE_8x8 1
+ jmp var_avx512_end
+
+;-----------------------------------------------------------------------------
+; int pixel_var2_8x8( pixel *fenc, pixel *fdec, int ssd[2] )
+;-----------------------------------------------------------------------------
+
+%if ARCH_X86_64
+ DECLARE_REG_TMP 6
+%else
+ DECLARE_REG_TMP 2
+%endif
+
+%macro VAR2_END 3 ; src, tmp, shift
+ movifnidn r2, r2mp
+ pshufd %2, %1, q3331
+ pmuludq %1, %1
+ movq [r2], %2 ; sqr_u sqr_v
+ psrld %1, %3
+ psubd %2, %1 ; sqr - (sum * sum >> shift)
+ MOVHL %1, %2
+ paddd %1, %2
+ movd eax, %1
+ RET
+%endmacro
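A scalar sketch of the new pixel_var2 interface declared above (fenc, fdec, ssd[2]), written from the asm: fenc rows are assumed to advance by FENC_STRIDE (16) and fdec rows by FDEC_STRIDE (32), with the v plane at fenc+8 and fdec+16 as the addressing in the SSE2 loop suggests. The shift is 6 for 8x8 and 7 for 8x16, matching the VAR2_8x8_* instantiations; coefficient types assume the 8-bit build.

#include <stdint.h>

/* Illustrative only; strides and u/v offsets are inferred, not verified. */
static int var2_8xh_ref( const uint8_t *fenc, const uint8_t *fdec, int ssd[2], int h, int shift )
{
    int sum_u = 0, sum_v = 0, sqr_u = 0, sqr_v = 0;
    for( int y = 0; y < h; y++, fenc += 16, fdec += 32 )
        for( int x = 0; x < 8; x++ )
        {
            int du = fenc[x]   - fdec[x];
            int dv = fenc[x+8] - fdec[x+16];
            sum_u += du; sqr_u += du*du;
            sum_v += dv; sqr_v += dv*dv;
        }
    ssd[0] = sqr_u;
    ssd[1] = sqr_v;
    /* VAR2_END: per-plane sqr - (sum*sum >> shift), u and v summed */
    return (sqr_u - (sum_u*sum_u >> shift)) + (sqr_v - (sum_v*sum_v >> shift));
}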
%macro VAR2_8x8_SSE2 2
-cglobal pixel_var2_8x%1, 5,6,8
- VAR_START 1
- mov r5d, %1/2
+%if HIGH_BIT_DEPTH
+cglobal pixel_var2_8x%1, 2,3,6
+ pxor m4, m4
+ pxor m5, m5
+%define %%sum2 m4
+%define %%sqr2 m5
+%else
+cglobal pixel_var2_8x%1, 2,3,7
+ mova m6, [pw_00ff]
+%define %%sum2 m0
+%define %%sqr2 m1
+%endif
+ pxor m0, m0 ; sum
+ pxor m1, m1 ; sqr
+ mov t0d, (%1-1)*FENC_STRIDEB
.loop:
%if HIGH_BIT_DEPTH
- mova m0, [r0]
- mova m1, [r0+r1*2]
- mova m2, [r2]
- mova m3, [r2+r3*2]
-%else ; !HIGH_BIT_DEPTH
- movq m1, [r0]
- movhps m1, [r0+r1]
- movq m3, [r2]
- movhps m3, [r2+r3]
- DEINTB 0, 1, 2, 3, 7
-%endif ; HIGH_BIT_DEPTH
- psubw m0, m2
- psubw m1, m3
- paddw m5, m0
- paddw m5, m1
- pmaddwd m0, m0
- pmaddwd m1, m1
- paddd m6, m0
- paddd m6, m1
- lea r0, [r0+r1*2*SIZEOF_PIXEL]
- lea r2, [r2+r3*2*SIZEOF_PIXEL]
- dec r5d
- jg .loop
- VAR2_END %2, m5, m6
+ mova m2, [r0+1*t0]
+ psubw m2, [r1+2*t0]
+ mova m3, [r0+1*t0+16]
+ psubw m3, [r1+2*t0+32]
+%else
+ mova m3, [r0+1*t0]
+ movq m5, [r1+2*t0]
+ punpcklqdq m5, [r1+2*t0+16]
+ DEINTB 2, 3, 4, 5, 6
+ psubw m2, m4
+ psubw m3, m5
+%endif
+ paddw m0, m2
+ pmaddwd m2, m2
+ paddw %%sum2, m3
+ pmaddwd m3, m3
+ paddd m1, m2
+ paddd %%sqr2, m3
+ sub t0d, FENC_STRIDEB
+ jge .loop
+%if HIGH_BIT_DEPTH
+ SBUTTERFLY dq, 0, 4, 2
+ paddw m0, m4 ; sum_u sum_v
+ pmaddwd m0, [pw_1]
+ SBUTTERFLY dq, 1, 5, 2
+ paddd m1, m5 ; sqr_u sqr_v
+ SBUTTERFLY dq, 0, 1, 2
+ paddd m0, m1
+%else
+ pmaddwd m0, [pw_1]
+ shufps m2, m0, m1, q2020
+ shufps m0, m1, q3131
+ paddd m0, m2
+ pshufd m0, m0, q3120 ; sum_u sqr_u sum_v sqr_v
+%endif
+ VAR2_END m0, m1, %2
%endmacro
INIT_XMM sse2
VAR2_8x8_SSE2 8, 6
VAR2_8x8_SSE2 16, 7
+%macro VAR2_CORE 3 ; src1, src2, accum
+%if %3
+ paddw m0, %1
+ pmaddwd %1, %1
+ paddw m0, %2
+ pmaddwd %2, %2
+ paddd m1, %1
+ paddd m1, %2
+%else
+ paddw m0, %1, %2
+ pmaddwd %1, %1
+ pmaddwd %2, %2
+ paddd m1, %1, %2
+%endif
+%endmacro
+
%if HIGH_BIT_DEPTH == 0
-%macro VAR2_8x8_SSSE3 2
-cglobal pixel_var2_8x%1, 5,6,8
- pxor m5, m5 ; sum
- pxor m6, m6 ; sum squared
- mova m7, [hsub_mul]
- mov r5d, %1/4
+INIT_XMM ssse3
+cglobal pixel_var2_internal
+ pxor m0, m0 ; sum
+ pxor m1, m1 ; sqr
.loop:
- movq m0, [r0]
- movq m2, [r2]
- movq m1, [r0+r1]
- movq m3, [r2+r3]
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- punpcklbw m0, m2
- punpcklbw m1, m3
- movq m2, [r0]
- movq m3, [r2]
- punpcklbw m2, m3
- movq m3, [r0+r1]
- movq m4, [r2+r3]
- punpcklbw m3, m4
- pmaddubsw m0, m7
- pmaddubsw m1, m7
- pmaddubsw m2, m7
- pmaddubsw m3, m7
- paddw m5, m0
- paddw m5, m1
- paddw m5, m2
- paddw m5, m3
- pmaddwd m0, m0
- pmaddwd m1, m1
- pmaddwd m2, m2
- pmaddwd m3, m3
- paddd m6, m0
- paddd m6, m1
- paddd m6, m2
- paddd m6, m3
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- dec r5d
+ movq m2, [r0+1*t0]
+ punpcklbw m2, [r1+2*t0]
+ movq m3, [r0+1*t0-1*FENC_STRIDE]
+ punpcklbw m3, [r1+2*t0-1*FDEC_STRIDE]
+ movq m4, [r0+1*t0-2*FENC_STRIDE]
+ punpcklbw m4, [r1+2*t0-2*FDEC_STRIDE]
+ movq m5, [r0+1*t0-3*FENC_STRIDE]
+ punpcklbw m5, [r1+2*t0-3*FDEC_STRIDE]
+ pmaddubsw m2, m7
+ pmaddubsw m3, m7
+ pmaddubsw m4, m7
+ pmaddubsw m5, m7
+ VAR2_CORE m2, m3, 1
+ VAR2_CORE m4, m5, 1
+ sub t0d, 4*FENC_STRIDE
jg .loop
- VAR2_END %2, m5, m6
+ pmaddwd m0, [pw_1]
+ ret
+
+%macro VAR2_8x8_SSSE3 2
+cglobal pixel_var2_8x%1, 2,3,8
+ mova m7, [hsub_mul]
+ mov t0d, (%1-1)*FENC_STRIDE
+ call pixel_var2_internal_ssse3 ; u
+ add r0, 8
+ add r1, 16
+ SBUTTERFLY qdq, 0, 1, 6
+ paddd m1, m0
+ mov t0d, (%1-1)*FENC_STRIDE
+ call pixel_var2_internal_ssse3 ; v
+ SBUTTERFLY qdq, 0, 6, 2
+ paddd m0, m6
+ phaddd m1, m0 ; sum_u sqr_u sum_v sqr_v
+ VAR2_END m1, m0, %2
%endmacro
-INIT_XMM ssse3
-VAR2_8x8_SSSE3 8, 6
-VAR2_8x8_SSSE3 16, 7
-INIT_XMM xop
VAR2_8x8_SSSE3 8, 6
VAR2_8x8_SSSE3 16, 7
+%endif ; !HIGH_BIT_DEPTH
+
+%macro VAR2_AVX2_LOAD 3 ; offset_reg, row1_offset, row2_offset
+%if HIGH_BIT_DEPTH
+%if mmsize == 64
+ mova m2, [r1+2*%1+%2*FDEC_STRIDEB]
+ vshufi32x4 m2, [r1+2*%1+%2*FDEC_STRIDEB+64], q2020
+ mova m3, [r1+2*%1+%3*FDEC_STRIDEB]
+ vshufi32x4 m3, [r1+2*%1+%3*FDEC_STRIDEB+64], q2020
+%else
+ mova xm2, [r1+2*%1+%2*FDEC_STRIDEB]
+ vinserti128 m2, [r1+2*%1+%2*FDEC_STRIDEB+32], 1
+ mova xm3, [r1+2*%1+%3*FDEC_STRIDEB]
+ vinserti128 m3, [r1+2*%1+%3*FDEC_STRIDEB+32], 1
+%endif
+ psubw m2, [r0+1*%1+%2*FENC_STRIDEB]
+ psubw m3, [r0+1*%1+%3*FENC_STRIDEB]
+%else
+ pmovzxbw m2, [r0+1*%1+%2*FENC_STRIDE]
+ mova m4, [r1+2*%1+%2*FDEC_STRIDE]
+ pmovzxbw m3, [r0+1*%1+%3*FENC_STRIDE]
+ mova m5, [r1+2*%1+%3*FDEC_STRIDE]
+ punpcklbw m4, m6
+ punpcklbw m5, m6
+ psubw m2, m4
+ psubw m3, m5
+%endif
+%endmacro
%macro VAR2_8x8_AVX2 2
-cglobal pixel_var2_8x%1, 5,6,6
- pxor m3, m3 ; sum
- pxor m4, m4 ; sum squared
- mova m5, [hsub_mul]
- mov r5d, %1/4
+%if HIGH_BIT_DEPTH
+cglobal pixel_var2_8x%1, 2,3,4
+%else
+cglobal pixel_var2_8x%1, 2,3,7
+ pxor m6, m6
+%endif
+ mov t0d, (%1-3)*FENC_STRIDEB
+ VAR2_AVX2_LOAD t0, 2, 1
+ VAR2_CORE m2, m3, 0
.loop:
- movq xm0, [r0]
- movq xm1, [r2]
- vinserti128 m0, m0, [r0+r1], 1
- vinserti128 m1, m1, [r2+r3], 1
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- punpcklbw m0, m1
- movq xm1, [r0]
- movq xm2, [r2]
- vinserti128 m1, m1, [r0+r1], 1
- vinserti128 m2, m2, [r2+r3], 1
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- punpcklbw m1, m2
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- paddw m3, m0
- paddw m3, m1
- pmaddwd m0, m0
- pmaddwd m1, m1
- paddd m4, m0
- paddd m4, m1
- dec r5d
+ VAR2_AVX2_LOAD t0, 0, -1
+ VAR2_CORE m2, m3, 1
+ sub t0d, 2*FENC_STRIDEB
jg .loop
- vextracti128 xm0, m3, 1
- vextracti128 xm1, m4, 1
- paddw xm3, xm0
- paddd xm4, xm1
- VAR2_END %2, xm3, xm4
+
+ pmaddwd m0, [pw_1]
+ SBUTTERFLY qdq, 0, 1, 2
+ paddd m0, m1
+ vextracti128 xm1, m0, 1
+ phaddd xm0, xm1
+ VAR2_END xm0, xm1, %2
%endmacro
INIT_YMM avx2
VAR2_8x8_AVX2 8, 6
VAR2_8x8_AVX2 16, 7
-%endif ; !HIGH_BIT_DEPTH
+%macro VAR2_AVX512_END 1 ; shift
+ vbroadcasti32x4 m2, [pw_1]
+ pmaddwd m0, m2
+ SBUTTERFLY qdq, 0, 1, 2
+ paddd m0, m1
+ vextracti32x8 ym1, m0, 1
+ paddd ym0, ym1
+ psrlq ym1, ym0, 32
+ paddd ym0, ym1
+ vpmovqd xmm0, ym0 ; sum_u, sqr_u, sum_v, sqr_v
+ VAR2_END xmm0, xmm1, %1
+%endmacro
+
+INIT_ZMM avx512
+cglobal pixel_var2_8x8, 2,3
+%if HIGH_BIT_DEPTH == 0
+ pxor xm6, xm6
+%endif
+ VAR2_AVX2_LOAD 0, 0, 2
+ VAR2_CORE m2, m3, 0
+ VAR2_AVX2_LOAD 0, 4, 6
+ VAR2_CORE m2, m3, 1
+ VAR2_AVX512_END 6
+
+cglobal pixel_var2_8x16, 2,3
+%if HIGH_BIT_DEPTH == 0
+ pxor xm6, xm6
+%endif
+ mov t0d, 10*FENC_STRIDEB
+ VAR2_AVX2_LOAD 0, 14, 12
+ VAR2_CORE m2, m3, 0
+.loop:
+ VAR2_AVX2_LOAD t0, 0, -2
+ VAR2_CORE m2, m3, 1
+ sub t0d, 4*FENC_STRIDEB
+ jg .loop
+ VAR2_AVX512_END 7
;=============================================================================
; SATD
@@ -4583,6 +4700,244 @@ cglobal intra_sad_x9_8x8, 5,7,8
mov rsp, r6
mov eax, r2d
RET
+
+%macro SATD_AVX512_LOAD4 2 ; size, opmask
+ vpbroadcast%1 m0, [r0]
+ vpbroadcast%1 m0 {%2}, [r0+2*r1]
+ vpbroadcast%1 m2, [r2]
+ vpbroadcast%1 m2 {%2}, [r2+2*r3]
+ add r0, r1
+ add r2, r3
+ vpbroadcast%1 m1, [r0]
+ vpbroadcast%1 m1 {%2}, [r0+2*r1]
+ vpbroadcast%1 m3, [r2]
+ vpbroadcast%1 m3 {%2}, [r2+2*r3]
+%endmacro
+
+%macro SATD_AVX512_LOAD8 5 ; size, halfreg, opmask1, opmask2, opmask3
+ vpbroadcast%1 %{2}0, [r0]
+ vpbroadcast%1 %{2}0 {%3}, [r0+2*r1]
+ vpbroadcast%1 %{2}2, [r2]
+ vpbroadcast%1 %{2}2 {%3}, [r2+2*r3]
+ vpbroadcast%1 m0 {%4}, [r0+4*r1]
+ vpbroadcast%1 m2 {%4}, [r2+4*r3]
+ vpbroadcast%1 m0 {%5}, [r0+2*r4]
+ vpbroadcast%1 m2 {%5}, [r2+2*r5]
+ vpbroadcast%1 %{2}1, [r0+r1]
+ vpbroadcast%1 %{2}1 {%3}, [r0+r4]
+ vpbroadcast%1 %{2}3, [r2+r3]
+ vpbroadcast%1 %{2}3 {%3}, [r2+r5]
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ vpbroadcast%1 m1 {%4}, [r0+r1]
+ vpbroadcast%1 m3 {%4}, [r2+r3]
+ vpbroadcast%1 m1 {%5}, [r0+r4]
+ vpbroadcast%1 m3 {%5}, [r2+r5]
+%endmacro
+
+%macro SATD_AVX512_PACKED 0
+ DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
+ SUMSUB_BA w, 0, 1, 2
+ SBUTTERFLY qdq, 0, 1, 2
+ SUMSUB_BA w, 0, 1, 2
+ HMAXABSW2 0, 1, 2, 3
+%endmacro
+
+%macro SATD_AVX512_END 0-1 0 ; sa8d
+ paddw m0 {k1}{z}, m1 ; zero-extend to dwords
+%if ARCH_X86_64
+%if mmsize == 64
+ vextracti32x8 ym1, m0, 1
+ paddd ym0, ym1
+%endif
+%if mmsize >= 32
+ vextracti128 xm1, ym0, 1
+ paddd xmm0, xm0, xm1
+%endif
+ punpckhqdq xmm1, xmm0, xmm0
+ paddd xmm0, xmm1
+ movq rax, xmm0
+ rorx rdx, rax, 32
+%if %1
+ lea eax, [rax+rdx+1]
+ shr eax, 1
+%else
+ add eax, edx
+%endif
+%else
+ HADDD m0, m1
+ movd eax, xm0
+%if %1
+ inc eax
+ shr eax, 1
+%endif
+%endif
+ RET
+%endmacro
+
+%macro HMAXABSW2 4 ; a, b, tmp1, tmp2
+ pabsw m%1, m%1
+ pabsw m%2, m%2
+ psrldq m%3, m%1, 2
+ psrld m%4, m%2, 16
+ pmaxsw m%1, m%3
+ pmaxsw m%2, m%4
+%endmacro
+
+INIT_ZMM avx512
+cglobal pixel_satd_16x8_internal
+ vbroadcasti64x4 m6, [hmul_16p]
+ kxnorb k2, k2, k2
+ mov r4d, 0x55555555
+ knotw k2, k2
+ kmovd k1, r4d
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+satd_16x8_avx512:
+ vbroadcasti128 ym0, [r0]
+ vbroadcasti32x4 m0 {k2}, [r0+4*r1] ; 0 0 4 4
+ vbroadcasti128 ym4, [r2]
+ vbroadcasti32x4 m4 {k2}, [r2+4*r3]
+ vbroadcasti128 ym2, [r0+2*r1]
+ vbroadcasti32x4 m2 {k2}, [r0+2*r4] ; 2 2 6 6
+ vbroadcasti128 ym5, [r2+2*r3]
+ vbroadcasti32x4 m5 {k2}, [r2+2*r5]
+ DIFF_SUMSUB_SSSE3 0, 4, 2, 5, 6
+ vbroadcasti128 ym1, [r0+r1]
+ vbroadcasti128 ym4, [r2+r3]
+ vbroadcasti128 ym3, [r0+r4]
+ vbroadcasti128 ym5, [r2+r5]
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ vbroadcasti32x4 m1 {k2}, [r0+r1] ; 1 1 5 5
+ vbroadcasti32x4 m4 {k2}, [r2+r3]
+ vbroadcasti32x4 m3 {k2}, [r0+r4] ; 3 3 7 7
+ vbroadcasti32x4 m5 {k2}, [r2+r5]
+ DIFF_SUMSUB_SSSE3 1, 4, 3, 5, 6
+ HADAMARD4_V 0, 1, 2, 3, 4
+ HMAXABSW2 0, 2, 4, 5
+ HMAXABSW2 1, 3, 4, 5
+ paddw m4, m0, m2 ; m1
+ paddw m2, m1, m3 ; m0
+ ret
+
+cglobal pixel_satd_8x8_internal
+ vbroadcasti64x4 m4, [hmul_16p]
+ mov r4d, 0x55555555
+ kmovd k1, r4d ; 01010101
+ kshiftlb k2, k1, 5 ; 10100000
+ kshiftlb k3, k1, 4 ; 01010000
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+satd_8x8_avx512:
+ SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
+ SATD_AVX512_PACKED ; 3 1 3 1 7 5 7 5
+ ret
+
+cglobal pixel_satd_16x8, 4,6
+ call pixel_satd_16x8_internal_avx512
+ jmp satd_zmm_avx512_end
+
+cglobal pixel_satd_16x16, 4,6
+ call pixel_satd_16x8_internal_avx512
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ paddw m7, m0, m1
+ call satd_16x8_avx512
+ paddw m1, m7
+ jmp satd_zmm_avx512_end
+
+cglobal pixel_satd_8x8, 4,6
+ call pixel_satd_8x8_internal_avx512
+satd_zmm_avx512_end:
+ SATD_AVX512_END
+
+cglobal pixel_satd_8x16, 4,6
+ call pixel_satd_8x8_internal_avx512
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ paddw m5, m0, m1
+ call satd_8x8_avx512
+ paddw m1, m5
+ jmp satd_zmm_avx512_end
+
+INIT_YMM avx512
+cglobal pixel_satd_4x8_internal
+ vbroadcasti128 m4, [hmul_4p]
+ mov r4d, 0x55550c
+ kmovd k2, r4d ; 00001100
+ kshiftlb k3, k2, 2 ; 00110000
+ kshiftlb k4, k2, 4 ; 11000000
+ kshiftrd k1, k2, 8 ; 01010101
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+satd_4x8_avx512:
+ SATD_AVX512_LOAD8 d, xm, k2, k3, k4 ; 0 0 2 2 4 4 6 6
+satd_ymm_avx512: ; 1 1 3 3 5 5 7 7
+ SATD_AVX512_PACKED
+ ret
+
+cglobal pixel_satd_8x4, 4,5
+ mova m4, [hmul_16p]
+ mov r4d, 0x5555
+ kmovw k1, r4d
+ SATD_AVX512_LOAD4 q, k1 ; 2 0 2 0
+ call satd_ymm_avx512 ; 3 1 3 1
+ jmp satd_ymm_avx512_end2
+
+cglobal pixel_satd_4x8, 4,6
+ call pixel_satd_4x8_internal_avx512
+satd_ymm_avx512_end:
+%if ARCH_X86_64 == 0
+ pop r5d
+ %assign regs_used 5
+%endif
+satd_ymm_avx512_end2:
+ SATD_AVX512_END
+
+cglobal pixel_satd_4x16, 4,6
+ call pixel_satd_4x8_internal_avx512
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ paddw m5, m0, m1
+ call satd_4x8_avx512
+ paddw m1, m5
+ jmp satd_ymm_avx512_end
+
+INIT_XMM avx512
+cglobal pixel_satd_4x4, 4,5
+ mova m4, [hmul_4p]
+ mov r4d, 0x550c
+ kmovw k2, r4d
+ kshiftrw k1, k2, 8
+ SATD_AVX512_LOAD4 d, k2 ; 0 0 2 2
+ SATD_AVX512_PACKED ; 1 1 3 3
+ SWAP 0, 1
+ SATD_AVX512_END
+
+INIT_ZMM avx512
+cglobal pixel_sa8d_8x8, 4,6
+ vbroadcasti64x4 m4, [hmul_16p]
+ mov r4d, 0x55555555
+ kmovd k1, r4d ; 01010101
+ kshiftlb k2, k1, 5 ; 10100000
+ kshiftlb k3, k1, 4 ; 01010000
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
+ DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 ; 3 1 3 1 7 5 7 5
+ SUMSUB_BA w, 0, 1, 2
+ SBUTTERFLY qdq, 0, 1, 2
+ SUMSUB_BA w, 0, 1, 2
+ shufps m2, m0, m1, q2020
+ shufps m1, m0, m1, q3131
+ SUMSUB_BA w, 2, 1, 0
+ vshufi32x4 m0, m2, m1, q1010
+ vshufi32x4 m1, m2, m1, q3232
+ SUMSUB_BA w, 0, 1, 2
+ HMAXABSW2 0, 1, 2, 3
+ SATD_AVX512_END 1
+
%endif ; HIGH_BIT_DEPTH
;=============================================================================
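For readers unfamiliar with the metric the new AVX-512 SATD kernels implement, a scalar sketch: a 4x4 Hadamard transform of the pixel differences, absolute values summed and halved. The pmaxsw step in HMAXABSW2 folds the halving into the final butterfly stage, since |a+b| + |a-b| = 2*max(|a|,|b|). The sketch is illustrative and is not the project's reference implementation.

#include <stdint.h>
#include <stdlib.h>

static int satd_4x4_ref( const uint8_t *p1, intptr_t s1, const uint8_t *p2, intptr_t s2 )
{
    int d[4][4], sum = 0;
    for( int i = 0; i < 4; i++ )
        for( int j = 0; j < 4; j++ )
            d[i][j] = p1[i*s1+j] - p2[i*s2+j];
    for( int i = 0; i < 4; i++ )  /* horizontal 4-point Hadamard */
    {
        int a = d[i][0]+d[i][1], b = d[i][0]-d[i][1];
        int c = d[i][2]+d[i][3], e = d[i][2]-d[i][3];
        d[i][0] = a+c; d[i][1] = b+e; d[i][2] = a-c; d[i][3] = b-e;
    }
    for( int j = 0; j < 4; j++ )  /* vertical pass, accumulate absolute values */
    {
        int a = d[0][j]+d[1][j], b = d[0][j]-d[1][j];
        int c = d[2][j]+d[3][j], e = d[2][j]-d[3][j];
        sum += abs(a+c) + abs(b+e) + abs(a-c) + abs(b-e);
    }
    return sum >> 1;
}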
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 2b0baa3..56cfc5c 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -52,6 +52,7 @@ DECL_X1( sad, sse2_aligned )
DECL_X1( sad, ssse3 )
DECL_X1( sad, ssse3_aligned )
DECL_X1( sad, avx2 )
+DECL_X1( sad, avx512 )
DECL_X4( sad, mmx2 )
DECL_X4( sad, sse2 )
DECL_X4( sad, sse3 )
@@ -59,6 +60,7 @@ DECL_X4( sad, ssse3 )
DECL_X4( sad, xop )
DECL_X4( sad, avx )
DECL_X4( sad, avx2 )
+DECL_X4( sad, avx512 )
DECL_X1( ssd, mmx )
DECL_X1( ssd, mmx2 )
DECL_X1( ssd, sse2slow )
@@ -75,6 +77,7 @@ DECL_X1( satd, sse4 )
DECL_X1( satd, avx )
DECL_X1( satd, xop )
DECL_X1( satd, avx2 )
+DECL_X1( satd, avx512 )
DECL_X1( sa8d, mmx2 )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
@@ -83,6 +86,7 @@ DECL_X1( sa8d, sse4 )
DECL_X1( sa8d, avx )
DECL_X1( sa8d, xop )
DECL_X1( sa8d, avx2 )
+DECL_X1( sa8d, avx512 )
DECL_X1( sad, cache32_mmx2 );
DECL_X1( sad, cache64_mmx2 );
DECL_X1( sad, cache64_sse2 );
@@ -92,11 +96,10 @@ DECL_X4( sad, cache64_mmx2 );
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
-DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, avx512, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride ))
@@ -165,16 +168,14 @@ void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, intptr_t stride1,
const pixel *pix2, intptr_t stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width );
-int x264_pixel_var2_8x8_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
-int x264_pixel_var2_8x8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
-int x264_pixel_var2_8x8_ssse3 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int x264_pixel_var2_8x8_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int x264_pixel_var2_8x8_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int x264_pixel_var2_8x16_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
-int x264_pixel_var2_8x16_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
-int x264_pixel_var2_8x16_ssse3( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int x264_pixel_var2_8x16_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int x264_pixel_var2_8x16_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] );
+int x264_pixel_var2_8x8_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
+int x264_pixel_var2_8x8_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] );
+int x264_pixel_var2_8x8_avx512 ( pixel *fenc, pixel *fdec, int ssd[2] );
+int x264_pixel_var2_8x16_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] );
+int x264_pixel_var2_8x16_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
+int x264_pixel_var2_8x16_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] );
+int x264_pixel_var2_8x16_avx512( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 2391b57..f8ebbe5 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -30,7 +30,14 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA 32
+SECTION_RODATA 64
+
+%if HIGH_BIT_DEPTH
+decimate_shuf_avx512: dd 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15
+%else
+dequant_shuf_avx512: dw 0, 2, 4, 6, 8,10,12,14,16,18,20,22,24,26,28,30
+ dw 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62
+%endif
%macro DQM4 3
dw %1, %2, %1, %2, %2, %3, %2, %3
@@ -42,14 +49,6 @@ SECTION_RODATA 32
dw %4, %2, %6, %2, %4, %2, %6, %2
%endmacro
-dequant4_scale:
- DQM4 10, 13, 16
- DQM4 11, 14, 18
- DQM4 13, 16, 20
- DQM4 14, 18, 23
- DQM4 16, 20, 25
- DQM4 18, 23, 29
-
dequant8_scale:
DQM8 20, 18, 32, 19, 25, 24
DQM8 22, 19, 35, 21, 28, 26
@@ -58,6 +57,14 @@ dequant8_scale:
DQM8 32, 28, 51, 30, 40, 38
DQM8 36, 32, 58, 34, 46, 43
+dequant4_scale:
+ DQM4 10, 13, 16
+ DQM4 11, 14, 18
+ DQM4 13, 16, 20
+ DQM4 14, 18, 23
+ DQM4 16, 20, 25
+ DQM4 18, 23, 29
+
decimate_mask_table4:
db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
@@ -743,6 +750,163 @@ DEQUANT 4, 4, 4
DEQUANT 8, 6, 4
%endif
+%macro DEQUANT_START_AVX512 1-2 0 ; shift, flat
+%if %2 == 0
+ movifnidn t2d, r2m
+%endif
+ imul t0d, t2d, 0x2b
+ shr t0d, 8 ; i_qbits = i_qp / 6
+ lea t1d, [t0*5]
+ sub t2d, t0d
+ sub t2d, t1d ; i_mf = i_qp % 6
+ shl t2d, %1
+%if %2
+%ifdef PIC
+%define dmf r1+t2
+ lea r1, [dequant8_scale]
+%else
+%define dmf t2+dequant8_scale
+%endif
+%elif ARCH_X86_64
+%define dmf r1+t2
+%else
+%define dmf r1
+ add r1, r1mp ; dequant_mf[i_mf]
+%endif
+ movifnidn r0, r0mp
+%endmacro
+
+INIT_ZMM avx512
+cglobal dequant_4x4, 0,3
+ DEQUANT_START_AVX512 6
+ mova m0, [dmf]
+%if HIGH_BIT_DEPTH
+ pmaddwd m0, [r0]
+%endif
+ sub t0d, 4
+ jl .rshift
+%if HIGH_BIT_DEPTH
+ vpbroadcastd m1, t0d
+ vpsllvd m0, m1
+ mova [r0], m0
+%else
+ vpbroadcastw ym1, t0d
+ vpmovsdw ym0, m0
+ pmullw ym0, [r0]
+ vpsllvw ym0, ym1
+ mova [r0], ym0
+%endif
+ RET
+.rshift:
+%if HIGH_BIT_DEPTH == 0
+ pmovzxwd m1, [r0]
+ pmaddwd m0, m1
+%endif
+ mov r1d, 1<<31
+ shrx r1d, r1d, t0d ; 1 << (-i_qbits-1)
+ neg t0d
+ vpbroadcastd m1, r1d
+ vpbroadcastd m2, t0d
+ paddd m0, m1
+ vpsravd m0, m2
+%if HIGH_BIT_DEPTH
+ mova [r0], m0
+%else
+ vpmovsdw [r0], m0
+%endif
+ RET
+
+cglobal dequant_8x8, 0,3
+ DEQUANT_START_AVX512 8
+ mova m0, [dmf+0*64]
+ mova m1, [dmf+1*64]
+ mova m2, [dmf+2*64]
+ mova m3, [dmf+3*64]
+%if HIGH_BIT_DEPTH
+ pmaddwd m0, [r0+0*64]
+ pmaddwd m1, [r0+1*64]
+ pmaddwd m2, [r0+2*64]
+ pmaddwd m3, [r0+3*64]
+%else
+ mova m6, [dequant_shuf_avx512]
+%endif
+ sub t0d, 6
+ jl .rshift
+%if HIGH_BIT_DEPTH
+ vpbroadcastd m4, t0d
+ vpsllvd m0, m4
+ vpsllvd m1, m4
+ vpsllvd m2, m4
+ vpsllvd m3, m4
+ jmp .end
+.rshift:
+%else
+ vpbroadcastw m4, t0d
+ vpermt2w m0, m6, m1
+ vpermt2w m2, m6, m3
+ pmullw m0, [r0]
+ pmullw m2, [r0+64]
+ vpsllvw m0, m4
+ vpsllvw m2, m4
+ mova [r0], m0
+ mova [r0+64], m2
+ RET
+.rshift:
+ pmovzxwd m4, [r0+0*32]
+ pmovzxwd m5, [r0+1*32]
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ pmovzxwd m4, [r0+2*32]
+ pmovzxwd m5, [r0+3*32]
+ pmaddwd m2, m4
+ pmaddwd m3, m5
+%endif
+ mov r1d, 1<<31
+ shrx r1d, r1d, t0d ; 1 << (-i_qbits-1)
+ neg t0d
+ vpbroadcastd m4, r1d
+ vpbroadcastd m5, t0d
+ paddd m0, m4
+ paddd m1, m4
+ vpsravd m0, m5
+ vpsravd m1, m5
+ paddd m2, m4
+ paddd m3, m4
+ vpsravd m2, m5
+ vpsravd m3, m5
+%if HIGH_BIT_DEPTH
+.end:
+ mova [r0+0*64], m0
+ mova [r0+1*64], m1
+ mova [r0+2*64], m2
+ mova [r0+3*64], m3
+%else
+ vpermt2w m0, m6, m1
+ vpermt2w m2, m6, m3
+ mova [r0], m0
+ mova [r0+64], m2
+%endif
+ RET
+
+%if HIGH_BIT_DEPTH == 0
+cglobal dequant_8x8_flat16, 0,3
+ movifnidn t2d, r2m
+ cmp t2d, 12
+ jl dequant_8x8_avx512
+ sub t2d, 12
+ DEQUANT_START_AVX512 6, 1
+ vpbroadcastw m0, t0d
+ mova m1, [dmf]
+ vpsllvw m1, m0
+ pmullw m0, m1, [r0]
+ pmullw m1, [r0+64]
+ mova [r0], m0
+ mova [r0+64], m1
+ RET
+%endif
+
+%undef dmf
+
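The DEQUANT_START_AVX512 math above (i_qbits = i_qp/6, i_mf = i_qp%6) and the left-shift versus rounded-right-shift split around "sub t0d, 4" correspond to the following scalar sketch of dequant_4x4. Types follow the prototypes in quant.h for the 8-bit case; treat it as an illustration of the arithmetic rather than the exact reference code.

#include <stdint.h>

static void dequant_4x4_ref( int16_t dct[16], int dequant_mf[6][16], int i_qp )
{
    int i_mf    = i_qp % 6;
    int i_qbits = i_qp / 6 - 4;
    if( i_qbits >= 0 )
        for( int i = 0; i < 16; i++ )
            dct[i] = (dct[i] * dequant_mf[i_mf][i]) << i_qbits;
    else
    {
        int f = 1 << (-i_qbits - 1);   /* same rounding term as the asm */
        for( int i = 0; i < 16; i++ )
            dct[i] = (dct[i] * dequant_mf[i_mf][i] + f) >> -i_qbits;
    }
}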
%macro DEQUANT_DC 2
cglobal dequant_4x4dc, 0,3,6
DEQUANT_START 6, 6
@@ -1208,13 +1372,12 @@ cglobal denoise_dct, 4,4,4
; int decimate_score( dctcoef *dct )
;-----------------------------------------------------------------------------
-%macro DECIMATE_MASK 5
-%if mmsize==16
+%macro DECIMATE_MASK 4
%if HIGH_BIT_DEPTH
- movdqa m0, [%3+ 0]
- movdqa m1, [%3+32]
- packssdw m0, [%3+16]
- packssdw m1, [%3+48]
+ mova m0, [%3+0*16]
+ packssdw m0, [%3+1*16]
+ mova m1, [%3+2*16]
+ packssdw m1, [%3+3*16]
ABSW2 m0, m1, m0, m1, m3, m4
%else
ABSW m0, [%3+ 0], m3
@@ -1226,40 +1389,35 @@ cglobal denoise_dct, 4,4,4
pcmpgtb m0, %4
pmovmskb %1, m2
pmovmskb %2, m0
-%else ; mmsize==8
+%endmacro
+
+%macro DECIMATE_MASK16_AVX512 0
+ mova m0, [r0]
%if HIGH_BIT_DEPTH
- movq m0, [%3+ 0]
- movq m1, [%3+16]
- movq m2, [%3+32]
- movq m3, [%3+48]
- packssdw m0, [%3+ 8]
- packssdw m1, [%3+24]
- packssdw m2, [%3+40]
- packssdw m3, [%3+56]
-%else
- movq m0, [%3+ 0]
- movq m1, [%3+ 8]
- movq m2, [%3+16]
- movq m3, [%3+24]
-%endif
- ABSW2 m0, m1, m0, m1, m6, m7
- ABSW2 m2, m3, m2, m3, m6, m7
- packsswb m0, m1
- packsswb m2, m3
- pxor m4, m4
- pxor m6, m6
- pcmpeqb m4, m0
- pcmpeqb m6, m2
- pcmpgtb m0, %4
- pcmpgtb m2, %4
- pmovmskb %5, m4
- pmovmskb %1, m6
- shl %1, 8
- or %1, %5
- pmovmskb %5, m0
- pmovmskb %2, m2
- shl %2, 8
- or %2, %5
+ vptestmd k0, m0, m0
+ pabsd m0, m0
+ vpcmpud k1, m0, [pd_1] {1to16}, 6
+%else
+ vptestmw k0, m0, m0
+ pabsw m0, m0
+ vpcmpuw k1, m0, [pw_1], 6
+%endif
+%endmacro
+
+%macro SHRX 2
+%if cpuflag(bmi2)
+ shrx %1, %1, %2
+%else
+ shr %1, %2b ; %2 has to be rcx/ecx
+%endif
+%endmacro
+
+%macro BLSR 2
+%if cpuflag(bmi1)
+ blsr %1, %2
+%else
+ lea %1, [%2-1]
+ and %1, %2
%endif
%endmacro
@@ -1269,33 +1427,60 @@ cextern decimate_table8
%macro DECIMATE4x4 1
cglobal decimate_score%1, 1,3
-%ifdef PIC
- lea r4, [decimate_table4]
- lea r5, [decimate_mask_table4]
- %define table r4
- %define mask_table r5
+%if cpuflag(avx512)
+ DECIMATE_MASK16_AVX512
+ xor eax, eax
+ kmovw edx, k0
+%if %1 == 15
+ shr edx, 1
%else
- %define table decimate_table4
- %define mask_table decimate_mask_table4
+ test edx, edx
%endif
- DECIMATE_MASK edx, eax, r0, [pb_1], ecx
+ jz .ret
+ ktestw k1, k1
+ jnz .ret9
+%else
+ DECIMATE_MASK edx, eax, r0, [pb_1]
xor edx, 0xffff
- je .ret
+ jz .ret
test eax, eax
- jne .ret9
-%if %1==15
+ jnz .ret9
+%if %1 == 15
shr edx, 1
%endif
+%endif
+%ifdef PIC
+ lea r4, [decimate_mask_table4]
+ %define mask_table r4
+%else
+ %define mask_table decimate_mask_table4
+%endif
movzx ecx, dl
movzx eax, byte [mask_table + rcx]
+%if ARCH_X86_64
+ xor edx, ecx
+ jz .ret
+%if cpuflag(lzcnt)
+ lzcnt ecx, ecx
+ lea r5, [decimate_table4-32]
+ add r5, rcx
+%else
+ bsr ecx, ecx
+ lea r5, [decimate_table4-1]
+ sub r5, rcx
+%endif
+ %define table r5
+%else
cmp edx, ecx
- je .ret
+ jz .ret
bsr ecx, ecx
shr edx, 1
- shr edx, cl
+ SHRX edx, ecx
+ %define table decimate_table4
+%endif
tzcnt ecx, edx
shr edx, 1
- shr edx, cl
+ SHRX edx, ecx
add al, byte [table + rcx]
add al, byte [mask_table + rdx]
.ret:
@@ -1303,175 +1488,224 @@ cglobal decimate_score%1, 1,3
.ret9:
mov eax, 9
RET
-
%endmacro
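A plain-C sketch of the score that DECIMATE4x4/DECIMATE8x8 compute with bitmasks: return 9 as soon as any coefficient has magnitude greater than 1 (the .ret9 path), otherwise add a table entry indexed by the length of the zero run preceding each nonzero coefficient, scanning from the highest coefficient down. decimate_table4/decimate_table8 are the tables the asm indexes; the control flow below is an illustration for the 8-bit coefficient type, not the project's C reference.

#include <stdint.h>

static int decimate_score_ref( const int16_t *dct, int n, const uint8_t *table )
{
    int idx = n - 1, score = 0;
    while( idx >= 0 && !dct[idx] )
        idx--;
    while( idx >= 0 )
    {
        if( dct[idx] < -1 || dct[idx] > 1 )
            return 9;               /* |coeff| > 1: bail out, same as the .ret9 path */
        idx--;
        int run = 0;                /* count zeros below this coefficient */
        while( idx >= 0 && !dct[idx] )
        {
            idx--;
            run++;
        }
        score += table[run];
    }
    return score;
}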
-%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-DECIMATE4x4 15
-DECIMATE4x4 16
-%endif
-INIT_XMM sse2
-DECIMATE4x4 15
-DECIMATE4x4 16
-INIT_XMM ssse3
-DECIMATE4x4 15
-DECIMATE4x4 16
-
-; 2x gt1 output, 2x nz output, 1x mask
-%macro DECIMATE_MASK64_AVX2 5
- pabsw m0, [r0+ 0]
- pabsw m2, [r0+32]
- pabsw m1, [r0+64]
- pabsw m3, [r0+96]
- packsswb m0, m2
- packsswb m1, m3
- pcmpgtb m2, m0, %5 ; the > 1 checks don't care about order, so
- pcmpgtb m3, m1, %5 ; we can save latency by doing them here
- pmovmskb %1, m2
- pmovmskb %2, m3
- or %1, %2
- jne .ret9
+%macro DECIMATE_MASK64_AVX2 2 ; nz_low, nz_high
+ mova m0, [r0+0*32]
+ packsswb m0, [r0+1*32]
+ mova m1, [r0+2*32]
+ packsswb m1, [r0+3*32]
+ mova m4, [pb_1]
+ pabsb m2, m0
+ pabsb m3, m1
+ por m2, m3 ; the > 1 checks don't care about order, so
+ ptest m4, m2 ; we can save latency by doing them here
+ jnc .ret9
vpermq m0, m0, q3120
vpermq m1, m1, q3120
pxor m4, m4
pcmpeqb m0, m4
pcmpeqb m1, m4
- pmovmskb %3, m0
- pmovmskb %4, m1
+ pmovmskb %1, m0
+ pmovmskb %2, m1
%endmacro
-%macro DECIMATE8x8 0
+%macro DECIMATE_MASK64_AVX512 0
+ mova m0, [r0]
+%if HIGH_BIT_DEPTH
+ packssdw m0, [r0+1*64]
+ mova m1, [r0+2*64]
+ packssdw m1, [r0+3*64]
+ packsswb m0, m1
+ vbroadcasti32x4 m1, [pb_1]
+ pabsb m2, m0
+ vpcmpub k0, m2, m1, 6
+ ktestq k0, k0
+ jnz .ret9
+ mova m1, [decimate_shuf_avx512]
+ vpermd m0, m1, m0
+ vptestmb k1, m0, m0
+%else
+ mova m1, [r0+64]
+ vbroadcasti32x4 m3, [pb_1]
+ packsswb m2, m0, m1
+ pabsb m2, m2
+ vpcmpub k0, m2, m3, 6
+ ktestq k0, k0
+ jnz .ret9
+ vptestmw k1, m0, m0
+ vptestmw k2, m1, m1
+%endif
+%endmacro
+%macro DECIMATE8x8 0
%if ARCH_X86_64
cglobal decimate_score64, 1,5
+%if mmsize == 64
+ DECIMATE_MASK64_AVX512
+ xor eax, eax
+%if HIGH_BIT_DEPTH
+ kmovq r1, k1
+ test r1, r1
+ jz .ret
+%else
+ kortestd k1, k2
+ jz .ret
+ kunpckdq k1, k2, k1
+ kmovq r1, k1
+%endif
+%elif mmsize == 32
+ DECIMATE_MASK64_AVX2 r1d, eax
+ not r1
+ shl rax, 32
+ xor r1, rax
+ jz .ret
+%else
+ mova m5, [pb_1]
+ DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5
+ test eax, eax
+ jnz .ret9
+ DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5
+ shl r2d, 16
+ or r1d, r2d
+ DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5
+ shl r2, 32
+ or eax, r3d
+ or r1, r2
+ DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5
+ not r1
+ shl r2, 48
+ xor r1, r2
+ jz .ret
+ add eax, r3d
+ jnz .ret9
+%endif
%ifdef PIC
lea r4, [decimate_table8]
%define table r4
%else
%define table decimate_table8
%endif
- mova m5, [pb_1]
-%if mmsize==32
- DECIMATE_MASK64_AVX2 eax, r2d, r1d, r3d, m5
- shl r3, 32
- or r1, r3
- xor r1, -1
- je .ret
-%else
- DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
- test eax, eax
- jne .ret9
- DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
- shl r2d, 16
- or r1d, r2d
- DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
- shl r2, 32
- or eax, r3d
- or r1, r2
- DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
- shl r2, 48
- or r1, r2
- xor r1, -1
- je .ret
- add eax, r3d
- jne .ret9
-%endif
- mov al, -6
+ mov al, -6
.loop:
tzcnt rcx, r1
- shr r1, cl
- add al, byte [table + rcx]
- jge .ret9
- shr r1, 1
- jne .loop
- add al, 6
+ add al, byte [table + rcx]
+ jge .ret9
+ shr r1, 1
+ SHRX r1, rcx
+%if cpuflag(bmi2)
+ test r1, r1
+%endif
+ jnz .loop
+ add al, 6
.ret:
REP_RET
.ret9:
- mov eax, 9
+ mov eax, 9
RET
%else ; ARCH
-%if mmsize == 8
-cglobal decimate_score64, 1,6
+cglobal decimate_score64, 1,4
+%if mmsize == 64
+ DECIMATE_MASK64_AVX512
+ xor eax, eax
+%if HIGH_BIT_DEPTH
+ kshiftrq k2, k1, 32
+%endif
+ kmovd r2, k1
+ kmovd r3, k2
+ test r2, r2
+ jz .tryret
+%elif mmsize == 32
+ DECIMATE_MASK64_AVX2 r2, r3
+ xor eax, eax
+ not r3
+ xor r2, -1
+ jz .tryret
%else
-cglobal decimate_score64, 1,5
-%endif
- mova m5, [pb_1]
-%if mmsize==32
- DECIMATE_MASK64_AVX2 r0, r2, r3, r4, m5
- xor r3, -1
- je .tryret
- xor r4, -1
-.cont:
-%else
- DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
- test r2, r2
- jne .ret9
- DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5
- shl r4, 16
- or r3, r4
- DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5
- or r2, r1
- DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5
- shl r1, 16
- or r4, r1
- xor r3, -1
- je .tryret
- xor r4, -1
-.cont:
- add r0, r2
- jne .ret9
-%endif
- mov al, -6
+ mova m5, [pb_1]
+ DECIMATE_MASK r2, r1, r0+SIZEOF_DCTCOEF* 0, m5
+ test r1, r1
+ jnz .ret9
+ DECIMATE_MASK r3, r1, r0+SIZEOF_DCTCOEF*16, m5
+ not r2
+ shl r3, 16
+ xor r2, r3
+ mov r0m, r2
+ DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF*32, m5
+ or r2, r1
+ DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5
+ add r0, r2
+ jnz .ret9
+ mov r2, r0m
+ not r3
+ shl r1, 16
+ xor r3, r1
+ test r2, r2
+ jz .tryret
+%endif
+ mov al, -6
.loop:
+ tzcnt ecx, r2
+ add al, byte [decimate_table8 + ecx]
+ jge .ret9
+ sub ecx, 31 ; increase the shift count by one to shift away the lowest set bit as well
+ jz .run31 ; only bits 0-4 are used so we have to explicitly handle the case of 1<<31
+ shrd r2, r3, cl
+ SHRX r3, ecx
+%if notcpuflag(bmi2)
+ test r2, r2
+%endif
+ jnz .loop
+ BLSR r2, r3
+ jz .end
+.largerun:
tzcnt ecx, r3
- test r3, r3
- je .largerun
- shrd r3, r4, cl
- shr r4, cl
- add al, byte [decimate_table8 + ecx]
- jge .ret9
- shrd r3, r4, 1
- shr r4, 1
- test r3, r3
- jne .loop
- test r4, r4
- jne .loop
- add al, 6
-.ret:
- REP_RET
-.tryret:
- xor r4, -1
- jne .cont
+ shr r3, 1
+ SHRX r3, ecx
+.loop2:
+ tzcnt ecx, r3
+ add al, byte [decimate_table8 + ecx]
+ jge .ret9
+ shr r3, 1
+ SHRX r3, ecx
+.run31:
+ test r3, r3
+ jnz .loop2
+.end:
+ add al, 6
RET
+.tryret:
+ BLSR r2, r3
+ jz .ret
+ mov al, -6
+ jmp .largerun
.ret9:
mov eax, 9
- RET
-.largerun:
- mov r3, r4
- xor r4, r4
- tzcnt ecx, r3
- shr r3, cl
- shr r3, 1
- jne .loop
- add al, 6
- RET
+.ret:
+ REP_RET
%endif ; ARCH
-
%endmacro
-%if ARCH_X86_64 == 0
-INIT_MMX mmx2
-DECIMATE8x8
-%endif
INIT_XMM sse2
+DECIMATE4x4 15
+DECIMATE4x4 16
DECIMATE8x8
INIT_XMM ssse3
+DECIMATE4x4 15
+DECIMATE4x4 16
DECIMATE8x8
+%if HIGH_BIT_DEPTH
+INIT_ZMM avx512
+%else
INIT_YMM avx2
DECIMATE8x8
+INIT_YMM avx512
+%endif
+DECIMATE4x4 15
+DECIMATE4x4 16
+INIT_ZMM avx512
+DECIMATE8x8
;-----------------------------------------------------------------------------
; int coeff_last( dctcoef *dct )
@@ -1556,7 +1790,7 @@ cglobal coeff_last4, 1,3
INIT_MMX mmx2
COEFF_LAST4
-INIT_MMX mmx2, lzcnt
+INIT_MMX lzcnt
COEFF_LAST4
%macro COEFF_LAST8 0
@@ -1579,7 +1813,7 @@ COEFF_LAST8
%endif
INIT_XMM sse2
COEFF_LAST8
-INIT_XMM sse2, lzcnt
+INIT_XMM lzcnt
COEFF_LAST8
%else ; !HIGH_BIT_DEPTH
@@ -1642,7 +1876,7 @@ cglobal coeff_last8, 1,3
INIT_MMX mmx2
COEFF_LAST48
-INIT_MMX mmx2, lzcnt
+INIT_MMX lzcnt
COEFF_LAST48
%endif ; HIGH_BIT_DEPTH
@@ -1707,7 +1941,7 @@ COEFF_LAST
%endif
INIT_XMM sse2
COEFF_LAST
-INIT_XMM sse2, lzcnt
+INIT_XMM lzcnt
COEFF_LAST
%macro LAST_MASK_AVX2 2
@@ -1729,7 +1963,7 @@ COEFF_LAST
%endmacro
%if ARCH_X86_64 == 0
-INIT_YMM avx2,lzcnt
+INIT_YMM avx2
cglobal coeff_last64, 1,2
pxor m2, m2
LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32
@@ -1744,7 +1978,7 @@ cglobal coeff_last64, 1,2
add eax, 32
RET
%else
-INIT_YMM avx2,lzcnt
+INIT_YMM avx2
cglobal coeff_last64, 1,3
pxor m2, m2
LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
@@ -1756,6 +1990,70 @@ cglobal coeff_last64, 1,3
RET
%endif
+%macro COEFF_LAST_AVX512 2 ; num, w/d
+cglobal coeff_last%1, 1,2
+ mova m0, [r0-(%1&1)*SIZEOF_DCTCOEF]
+ vptestm%2 k0, m0, m0
+%if %1 == 15
+ mov eax, 30
+ kmovw r1d, k0
+ lzcnt r1d, r1d
+ sub eax, r1d
+%else
+ kmovw eax, k0
+ lzcnt eax, eax
+ xor eax, 31
+%endif
+ RET
+%endmacro
+
+%macro COEFF_LAST64_AVX512 1 ; w/d
+cglobal coeff_last64, 1,2
+ pxor xm0, xm0
+ vpcmp%1 k0, m0, [r0+0*64], 4
+ vpcmp%1 k1, m0, [r0+1*64], 4
+%if HIGH_BIT_DEPTH
+ vpcmp%1 k2, m0, [r0+2*64], 4
+ vpcmp%1 k3, m0, [r0+3*64], 4
+ kunpckwd k0, k1, k0
+ kunpckwd k1, k3, k2
+%endif
+%if ARCH_X86_64
+ kunpckdq k0, k1, k0
+ kmovq rax, k0
+ lzcnt rax, rax
+ xor eax, 63
+%else
+ kmovd r1d, k1
+ kmovd eax, k0
+ lzcnt r1d, r1d
+ lzcnt eax, eax
+ xor r1d, 32
+ cmovnz eax, r1d
+ xor eax, 31
+%endif
+ RET
+%endmacro
+
+%if HIGH_BIT_DEPTH
+INIT_XMM avx512
+COEFF_LAST_AVX512 4, d
+INIT_YMM avx512
+COEFF_LAST_AVX512 8, d
+INIT_ZMM avx512
+COEFF_LAST_AVX512 15, d
+COEFF_LAST_AVX512 16, d
+COEFF_LAST64_AVX512 d
+%else ; !HIGH_BIT_DEPTH
+INIT_XMM avx512
+COEFF_LAST_AVX512 8, w
+INIT_YMM avx512
+COEFF_LAST_AVX512 15, w
+COEFF_LAST_AVX512 16, w
+INIT_ZMM avx512
+COEFF_LAST64_AVX512 w
+%endif ; !HIGH_BIT_DEPTH
+
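The new coeff_last kernels reduce to a simple scalar definition: the index of the last nonzero coefficient. The asm builds a nonzero-lane mask with vptestmw/vptestmd and converts it with lzcnt (for a nonzero 32-bit mask, lzcnt(mask) xor 31 equals 31 - lzcnt(mask), i.e. the position of the highest set bit). A sketch with int16_t coefficients (the 8-bit build); the -1 sentinel for an all-zero block is an assumption, since the SIMD code does not special-case that input:

#include <stdint.h>

static int coeff_last_ref( const int16_t *dct, int n )
{
    for( int i = n - 1; i >= 0; i-- )
        if( dct[i] )
            return i;
    return -1;  /* assumed sentinel for an all-zero block */
}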
;-----------------------------------------------------------------------------
; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
@@ -1833,15 +2131,17 @@ COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
-INIT_XMM sse2, lzcnt
+INIT_MMX lzcnt
+COEFF_LEVELRUN 4
+%if HIGH_BIT_DEPTH == 0
+COEFF_LEVELRUN 8
+%endif
+INIT_XMM lzcnt
%if HIGH_BIT_DEPTH
COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
-INIT_MMX mmx2, lzcnt
-COEFF_LEVELRUN 4
-COEFF_LEVELRUN 8
; Similar to the one above, but saves the DCT
; coefficients in m0/m1 so we don't have to load
@@ -1968,7 +2268,7 @@ INIT_XMM ssse3, lzcnt
COEFF_LEVELRUN_LUT 8
COEFF_LEVELRUN_LUT 15
COEFF_LEVELRUN_LUT 16
-INIT_XMM avx2, lzcnt
+INIT_XMM avx2
COEFF_LEVELRUN_LUT 15
COEFF_LEVELRUN_LUT 16
%endif
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 9596a58..6b74aac 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -66,12 +66,15 @@ void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_avx512( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_avx512( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_8x8_flat16_avx512( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_idct_dequant_2x4_dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
void x264_idct_dequant_2x4_dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
void x264_idct_dequant_2x4_dconly_sse2( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
@@ -85,16 +88,16 @@ void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int
void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_avx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_avx2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
-int x264_decimate_score15_mmx2( dctcoef *dct );
int x264_decimate_score15_sse2( dctcoef *dct );
int x264_decimate_score15_ssse3( dctcoef *dct );
-int x264_decimate_score16_mmx2( dctcoef *dct );
+int x264_decimate_score15_avx512( dctcoef *dct );
int x264_decimate_score16_sse2( dctcoef *dct );
int x264_decimate_score16_ssse3( dctcoef *dct );
-int x264_decimate_score64_mmx2( dctcoef *dct );
+int x264_decimate_score16_avx512( dctcoef *dct );
int x264_decimate_score64_sse2( dctcoef *dct );
int x264_decimate_score64_ssse3( dctcoef *dct );
int x264_decimate_score64_avx2( int16_t *dct );
+int x264_decimate_score64_avx512( dctcoef *dct );
int x264_coeff_last4_mmx2( dctcoef *dct );
int x264_coeff_last8_mmx2( dctcoef *dct );
int x264_coeff_last15_mmx2( dctcoef *dct );
@@ -104,33 +107,37 @@ int x264_coeff_last8_sse2( dctcoef *dct );
int x264_coeff_last15_sse2( dctcoef *dct );
int x264_coeff_last16_sse2( dctcoef *dct );
int x264_coeff_last64_sse2( dctcoef *dct );
-int x264_coeff_last4_mmx2_lzcnt( dctcoef *dct );
-int x264_coeff_last8_mmx2_lzcnt( dctcoef *dct );
-int x264_coeff_last8_sse2_lzcnt( dctcoef *dct );
-int x264_coeff_last15_sse2_lzcnt( dctcoef *dct );
-int x264_coeff_last16_sse2_lzcnt( dctcoef *dct );
-int x264_coeff_last64_sse2_lzcnt( dctcoef *dct );
-int x264_coeff_last64_avx2_lzcnt( dctcoef *dct );
+int x264_coeff_last4_lzcnt( dctcoef *dct );
+int x264_coeff_last8_lzcnt( dctcoef *dct );
+int x264_coeff_last15_lzcnt( dctcoef *dct );
+int x264_coeff_last16_lzcnt( dctcoef *dct );
+int x264_coeff_last64_lzcnt( dctcoef *dct );
+int x264_coeff_last64_avx2 ( dctcoef *dct );
+int x264_coeff_last4_avx512( int32_t *dct );
+int x264_coeff_last8_avx512( dctcoef *dct );
+int x264_coeff_last15_avx512( dctcoef *dct );
+int x264_coeff_last16_avx512( dctcoef *dct );
+int x264_coeff_last64_avx512( dctcoef *dct );
int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run16_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_avx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run15_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_avx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel );
-int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run8_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac );
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index ede52ae..8029e11 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -106,8 +106,6 @@ SAD 4, 16
SAD 4, 8
SAD 4, 4
-
-
;=============================================================================
; SAD XMM
;=============================================================================
@@ -119,118 +117,64 @@ SAD 4, 4
RET
%endmacro
-%macro SAD_W16 0
;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
-cglobal pixel_sad_16x16, 4,4,8
- movu m0, [r2]
- movu m1, [r2+r3]
- lea r2, [r2+2*r3]
- movu m2, [r2]
- movu m3, [r2+r3]
- lea r2, [r2+2*r3]
- psadbw m0, [r0]
- psadbw m1, [r0+r1]
- lea r0, [r0+2*r1]
- movu m4, [r2]
- paddw m0, m1
- psadbw m2, [r0]
- psadbw m3, [r0+r1]
- lea r0, [r0+2*r1]
- movu m5, [r2+r3]
- lea r2, [r2+2*r3]
- paddw m2, m3
- movu m6, [r2]
- movu m7, [r2+r3]
- lea r2, [r2+2*r3]
- paddw m0, m2
- psadbw m4, [r0]
- psadbw m5, [r0+r1]
- lea r0, [r0+2*r1]
- movu m1, [r2]
- paddw m4, m5
- psadbw m6, [r0]
- psadbw m7, [r0+r1]
- lea r0, [r0+2*r1]
- movu m2, [r2+r3]
- lea r2, [r2+2*r3]
- paddw m6, m7
- movu m3, [r2]
- paddw m0, m4
- movu m4, [r2+r3]
- lea r2, [r2+2*r3]
- paddw m0, m6
- psadbw m1, [r0]
- psadbw m2, [r0+r1]
- lea r0, [r0+2*r1]
- movu m5, [r2]
- paddw m1, m2
- psadbw m3, [r0]
- psadbw m4, [r0+r1]
- lea r0, [r0+2*r1]
- movu m6, [r2+r3]
- lea r2, [r2+2*r3]
- paddw m3, m4
- movu m7, [r2]
- paddw m0, m1
- movu m1, [r2+r3]
- paddw m0, m3
- psadbw m5, [r0]
- psadbw m6, [r0+r1]
- lea r0, [r0+2*r1]
- paddw m5, m6
- psadbw m7, [r0]
- psadbw m1, [r0+r1]
- paddw m7, m1
- paddw m0, m5
- paddw m0, m7
- SAD_END_SSE2
-
-;-----------------------------------------------------------------------------
-; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
-;-----------------------------------------------------------------------------
-cglobal pixel_sad_16x8, 4,4
- movu m0, [r2]
- movu m2, [r2+r3]
- lea r2, [r2+2*r3]
- movu m3, [r2]
- movu m4, [r2+r3]
- psadbw m0, [r0]
- psadbw m2, [r0+r1]
- lea r0, [r0+2*r1]
- psadbw m3, [r0]
- psadbw m4, [r0+r1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- paddw m0, m2
- paddw m3, m4
- paddw m0, m3
- movu m1, [r2]
- movu m2, [r2+r3]
- lea r2, [r2+2*r3]
- movu m3, [r2]
- movu m4, [r2+r3]
- psadbw m1, [r0]
- psadbw m2, [r0+r1]
- lea r0, [r0+2*r1]
- psadbw m3, [r0]
- psadbw m4, [r0+r1]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- paddw m1, m2
- paddw m3, m4
- paddw m0, m1
- paddw m0, m3
+%macro SAD_W16 1 ; h
+cglobal pixel_sad_16x%1, 4,4
+%ifidn cpuname, sse2
+.skip_prologue:
+%endif
+%assign %%i 0
+%if ARCH_X86_64
+ lea r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile
+ lea r5, [3*r3]
+%rep %1/4
+ movu m1, [r2]
+ psadbw m1, [r0]
+ movu m3, [r2+r3]
+ psadbw m3, [r0+r1]
+ movu m2, [r2+2*r3]
+ psadbw m2, [r0+2*r1]
+ movu m4, [r2+r5]
+ psadbw m4, [r0+r6]
+%if %%i != %1/4-1
+ lea r2, [r2+4*r3]
+ lea r0, [r0+4*r1]
+%endif
+ paddw m1, m3
+ paddw m2, m4
+ ACCUM paddw, 0, 1, %%i
+ paddw m0, m2
+ %assign %%i %%i+1
+%endrep
+%else ; The cost of having to save and restore registers on x86-32
+%rep %1/2 ; nullifies the benefit of having 3*stride in registers.
+ movu m1, [r2]
+ psadbw m1, [r0]
+ movu m2, [r2+r3]
+ psadbw m2, [r0+r1]
+%if %%i != %1/2-1
+ lea r2, [r2+2*r3]
+ lea r0, [r0+2*r1]
+%endif
+ ACCUM paddw, 0, 1, %%i
+ paddw m0, m2
+ %assign %%i %%i+1
+%endrep
+%endif
SAD_END_SSE2
%endmacro
INIT_XMM sse2
-SAD_W16
+SAD_W16 16
+SAD_W16 8
INIT_XMM sse3
-SAD_W16
+SAD_W16 16
+SAD_W16 8
INIT_XMM sse2, aligned
-SAD_W16
+SAD_W16 16
+SAD_W16 8
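The hand-unrolled 16x16 and 16x8 bodies are replaced by a single height-parameterized SAD_W16 macro that walks four rows per iteration on x86-64 (keeping 3*stride in r5/r6) and two rows on x86-32. Every variant still computes the plain sum of absolute differences over a 16-wide block; a scalar reference, assuming only the standard argument order:

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar reference for pixel_sad_16xh: psadbw performs the per-row |a - b|
     * accumulation 16 bytes at a time; the macro only changes how rows are
     * walked and how the partial sums are combined. */
    static int sad_16xh_ref( const uint8_t *pix1, intptr_t stride1,
                             const uint8_t *pix2, intptr_t stride2, int h )
    {
        int sum = 0;
        for( int y = 0; y < h; y++ )
        {
            for( int x = 0; x < 16; x++ )
                sum += abs( pix1[x] - pix2[x] );
            pix1 += stride1;
            pix2 += stride2;
        }
        return sum;
    }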
%macro SAD_INC_4x8P_SSE 1
movq m1, [r0]
@@ -259,7 +203,132 @@ cglobal pixel_sad_8x16_sse2, 4,4
SAD_INC_4x8P_SSE 1
SAD_INC_4x8P_SSE 1
SAD_END_SSE2
+
+%macro SAD_W48_AVX512 3 ; w, h, d/q
+cglobal pixel_sad_%1x%2, 4,4
+ kxnorb k1, k1, k1
+ kaddb k1, k1, k1
+%assign %%i 0
+%if ARCH_X86_64 && %2 != 4
+ lea r6, [3*r1]
+ lea r5, [3*r3]
+%rep %2/4
+ mov%3 m1, [r0]
+ vpbroadcast%3 m1 {k1}, [r0+r1]
+ mov%3 m3, [r2]
+ vpbroadcast%3 m3 {k1}, [r2+r3]
+ mov%3 m2, [r0+2*r1]
+ vpbroadcast%3 m2 {k1}, [r0+r6]
+ mov%3 m4, [r2+2*r3]
+ vpbroadcast%3 m4 {k1}, [r2+r5]
+%if %%i != %2/4-1
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+%endif
+ psadbw m1, m3
+ psadbw m2, m4
+ ACCUM paddd, 0, 1, %%i
+ paddd m0, m2
+ %assign %%i %%i+1
+%endrep
+%else
+%rep %2/2
+ mov%3 m1, [r0]
+ vpbroadcast%3 m1 {k1}, [r0+r1]
+ mov%3 m2, [r2]
+ vpbroadcast%3 m2 {k1}, [r2+r3]
+%if %%i != %2/2-1
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+%endif
+ psadbw m1, m2
+ ACCUM paddd, 0, 1, %%i
+ %assign %%i %%i+1
+%endrep
+%endif
+%if %1 == 8
+ punpckhqdq m1, m0, m0
+ paddd m0, m1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM avx512
+SAD_W48_AVX512 4, 4, d
+SAD_W48_AVX512 4, 8, d
+SAD_W48_AVX512 4, 16, d
+SAD_W48_AVX512 8, 4, q
+SAD_W48_AVX512 8, 8, q
+SAD_W48_AVX512 8, 16, q
+
+%macro SAD_W16_AVX512_START 1 ; h
+ cmp r1d, FENC_STRIDE ; optimized for the most common fenc case, which
+ jne pixel_sad_16x%1_sse2.skip_prologue ; has the rows laid out contiguously in memory
+ lea r1, [3*r3]
+%endmacro
+
+%macro SAD_W16_AVX512_END 0
+ paddd m0, m1
+ paddd m0, m2
+ paddd m0, m3
+%if mmsize == 64
+ vextracti32x8 ym1, m0, 1
+ paddd ym0, ym1
+%endif
+ vextracti128 xm1, ym0, 1
+ paddd xmm0, xm0, xm1
+ punpckhqdq xmm1, xmm0, xmm0
+ paddd xmm0, xmm1
+ movd eax, xmm0
RET
+%endmacro
+
+INIT_YMM avx512
+cglobal pixel_sad_16x8, 4,4
+ SAD_W16_AVX512_START 8
+ movu xm0, [r2]
+ vinserti128 m0, [r2+r3], 1
+ psadbw m0, [r0+0*32]
+ movu xm1, [r2+2*r3]
+ vinserti128 m1, [r2+r1], 1
+ lea r2, [r2+4*r3]
+ psadbw m1, [r0+1*32]
+ movu xm2, [r2]
+ vinserti128 m2, [r2+r3], 1
+ psadbw m2, [r0+2*32]
+ movu xm3, [r2+2*r3]
+ vinserti128 m3, [r2+r1], 1
+ psadbw m3, [r0+3*32]
+ SAD_W16_AVX512_END
+
+INIT_ZMM avx512
+cglobal pixel_sad_16x16, 4,4
+ SAD_W16_AVX512_START 16
+ movu xm0, [r2]
+ vinserti128 ym0, [r2+r3], 1
+ movu xm1, [r2+4*r3]
+ vinserti32x4 m0, [r2+2*r3], 2
+ vinserti32x4 m1, [r2+2*r1], 2
+ vinserti32x4 m0, [r2+r1], 3
+ lea r2, [r2+4*r3]
+ vinserti32x4 m1, [r2+r3], 1
+ psadbw m0, [r0+0*64]
+ vinserti32x4 m1, [r2+r1], 3
+ lea r2, [r2+4*r3]
+ psadbw m1, [r0+1*64]
+ movu xm2, [r2]
+ vinserti128 ym2, [r2+r3], 1
+ movu xm3, [r2+4*r3]
+ vinserti32x4 m2, [r2+2*r3], 2
+ vinserti32x4 m3, [r2+2*r1], 2
+ vinserti32x4 m2, [r2+r1], 3
+ lea r2, [r2+4*r3]
+ vinserti32x4 m3, [r2+r3], 1
+ psadbw m2, [r0+2*64]
+ vinserti32x4 m3, [r2+r1], 3
+ psadbw m3, [r0+3*64]
+ SAD_W16_AVX512_END
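The AVX-512 16-wide kernels only handle the case where the first operand is the encoder's fenc buffer, whose rows are FENC_STRIDE apart and therefore contiguous enough to load several rows as one ymm/zmm vector; any other stride branches to the .skip_prologue label of the SSE2 version. A hedged C sketch of that dispatch (the function-pointer names are placeholders, not real x264 symbols):

    #include <stdint.h>

    #define FENC_STRIDE_SKETCH 16

    typedef int (*sad16_fn)( const uint8_t *, intptr_t, const uint8_t *, intptr_t );

    /* Only the contiguous fenc layout takes the AVX-512 path; everything else
     * falls back to the generic SSE2 routine, as in SAD_W16_AVX512_START. */
    static int sad_16x16_dispatch( const uint8_t *fenc, intptr_t fenc_stride,
                                   const uint8_t *fref, intptr_t fref_stride,
                                   sad16_fn avx512_contiguous, sad16_fn sse2_generic )
    {
        if( fenc_stride != FENC_STRIDE_SKETCH )
            return sse2_generic( fenc, fenc_stride, fref, fref_stride );
        return avx512_contiguous( fenc, fenc_stride, fref, fref_stride );
    }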
;-----------------------------------------------------------------------------
; void pixel_vsad( pixel *src, intptr_t stride );
@@ -1548,6 +1617,225 @@ SAD_X_AVX2 3, 16, 8, 7
SAD_X_AVX2 4, 16, 16, 8
SAD_X_AVX2 4, 16, 8, 8
+%macro SAD_X_W4_AVX512 2 ; x, h
+cglobal pixel_sad_x%1_4x%2, %1+2,%1+3
+ mov t1d, 0xa
+ kmovb k1, t1d
+ lea t1, [3*t0]
+ kaddb k2, k1, k1
+ kshiftlb k3, k1, 2
+%assign %%i 0
+%rep %2/4
+ movu m6, [r0+%%i*64]
+ vmovddup m6 {k1}, [r0+%%i*64+32]
+ movd xmm2, [r1]
+ movd xmm4, [r1+t0]
+ vpbroadcastd xmm2 {k1}, [r1+2*t0]
+ vpbroadcastd xmm4 {k1}, [r1+t1]
+ vpbroadcastd xmm2 {k2}, [r2+t0]
+ vpbroadcastd xmm4 {k2}, [r2]
+ vpbroadcastd xmm2 {k3}, [r2+t1] ; a0 a2 b1 b3
+ vpbroadcastd xmm4 {k3}, [r2+2*t0] ; a1 a3 b0 b2
+ vpmovqd s1, m6 ; s0 s2 s1 s3
+ movd xmm3, [r3]
+ movd xmm5, [r3+t0]
+ vpbroadcastd xmm3 {k1}, [r3+2*t0]
+ vpbroadcastd xmm5 {k1}, [r3+t1]
+%if %1 == 4
+ vpbroadcastd xmm3 {k2}, [r4+t0]
+ vpbroadcastd xmm5 {k2}, [r4]
+ vpbroadcastd xmm3 {k3}, [r4+t1] ; c0 c2 d1 d3
+ vpbroadcastd xmm5 {k3}, [r4+2*t0] ; c1 c3 d0 d2
+%endif
+%if %%i != %2/4-1
+%assign %%j 1
+%rep %1
+ lea r%+%%j, [r%+%%j+4*t0]
+ %assign %%j %%j+1
+%endrep
+%endif
+ pshufd s2, s1, q1032
+ psadbw xmm2, s1
+ psadbw xmm4, s2
+ psadbw xmm3, s1
+ psadbw xmm5, s2
+%if %%i
+ paddd xmm0, xmm2
+ paddd xmm1, xmm3
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+%else
+ paddd xmm0, xmm2, xmm4
+ paddd xmm1, xmm3, xmm5
+%endif
+ %assign %%i %%i+1
+%endrep
+%if %1 == 4
+ movifnidn t2, r6mp
+%else
+ movifnidn t2, r5mp
+%endif
+ packusdw xmm0, xmm1
+ mova [t2], xmm0
+ RET
+%endmacro
+
+%macro SAD_X_W8_AVX512 2 ; x, h
+cglobal pixel_sad_x%1_8x%2, %1+2,%1+3
+ kxnorb k3, k3, k3
+ lea t1, [3*t0]
+ kaddb k1, k3, k3
+ kshiftlb k2, k3, 2
+ kshiftlb k3, k3, 3
+%assign %%i 0
+%rep %2/4
+ movddup m6, [r0+%%i*64] ; s0 s0 s1 s1
+ movq xm2, [r1]
+ movq xm4, [r1+2*t0]
+ vpbroadcastq xm2 {k1}, [r2]
+ vpbroadcastq xm4 {k1}, [r2+2*t0]
+ vpbroadcastq m2 {k2}, [r1+t0]
+ vpbroadcastq m4 {k2}, [r1+t1]
+ vpbroadcastq m2 {k3}, [r2+t0] ; a0 b0 a1 b1
+ vpbroadcastq m4 {k3}, [r2+t1] ; a2 b2 a3 b3
+ movddup m7, [r0+%%i*64+32] ; s2 s2 s3 s3
+ movq xm3, [r3]
+ movq xm5, [r3+2*t0]
+%if %1 == 4
+ vpbroadcastq xm3 {k1}, [r4]
+ vpbroadcastq xm5 {k1}, [r4+2*t0]
+%endif
+ vpbroadcastq m3 {k2}, [r3+t0]
+ vpbroadcastq m5 {k2}, [r3+t1]
+%if %1 == 4
+ vpbroadcastq m3 {k3}, [r4+t0] ; c0 d0 c1 d1
+ vpbroadcastq m5 {k3}, [r4+t1] ; c2 d2 c3 d3
+%endif
+%if %%i != %2/4-1
+%assign %%j 1
+%rep %1
+ lea r%+%%j, [r%+%%j+4*t0]
+ %assign %%j %%j+1
+%endrep
+%endif
+ psadbw m2, m6
+ psadbw m4, m7
+ psadbw m3, m6
+ psadbw m5, m7
+ ACCUM paddd, 0, 2, %%i
+ ACCUM paddd, 1, 3, %%i
+ paddd m0, m4
+ paddd m1, m5
+ %assign %%i %%i+1
+%endrep
+%if %1 == 4
+ movifnidn t2, r6mp
+%else
+ movifnidn t2, r5mp
+%endif
+ packusdw m0, m1
+ vextracti128 xm1, m0, 1
+ paddd xm0, xm1
+ mova [t2], xm0
+ RET
+%endmacro
+
+%macro SAD_X_W16_AVX512 2 ; x, h
+cglobal pixel_sad_x%1_16x%2, %1+2,%1+3
+ lea t1, [3*t0]
+%assign %%i 0
+%rep %2/4
+ mova m6, [r0+%%i*64] ; s0 s1 s2 s3
+ movu xm2, [r3]
+ movu xm4, [r3+t0]
+%if %1 == 4
+ vinserti128 ym2, [r4+t0], 1
+ vinserti128 ym4, [r4], 1
+%endif
+ vinserti32x4 m2, [r1+2*t0], 2
+ vinserti32x4 m4, [r1+t1], 2
+ vinserti32x4 m2, [r2+t1], 3 ; c0 d1 a2 b3
+ vinserti32x4 m4, [r2+2*t0], 3 ; c1 d0 a3 b2
+ vpermq m7, m6, q1032 ; s1 s0 s3 s2
+ movu xm3, [r1]
+ movu xm5, [r1+t0]
+ vinserti128 ym3, [r2+t0], 1
+ vinserti128 ym5, [r2], 1
+ vinserti32x4 m3, [r3+2*t0], 2
+ vinserti32x4 m5, [r3+t1], 2
+%if %1 == 4
+ vinserti32x4 m3, [r4+t1], 3 ; a0 b1 c2 d3
+ vinserti32x4 m5, [r4+2*t0], 3 ; a1 b0 c3 d2
+%endif
+%if %%i != %2/4-1
+%assign %%j 1
+%rep %1
+ lea r%+%%j, [r%+%%j+4*t0]
+ %assign %%j %%j+1
+%endrep
+%endif
+ psadbw m2, m6
+ psadbw m4, m7
+ psadbw m3, m6
+ psadbw m5, m7
+ ACCUM paddd, 0, 2, %%i
+ ACCUM paddd, 1, 3, %%i
+ paddd m0, m4
+ paddd m1, m5
+ %assign %%i %%i+1
+%endrep
+%if %1 == 4
+ movifnidn t2, r6mp
+%else
+ movifnidn t2, r5mp
+%endif
+ mov t1d, 0x1111
+ kmovw k1, t1d
+ vshufi32x4 m0, m0, q1032
+ paddd m0, m1
+ punpckhqdq m1, m0, m0
+ paddd m0, m1
+ vpcompressd m0 {k1}{z}, m0
+ mova [t2], xm0
+ RET
+%endmacro
+
+; t0 = stride, t1 = tmp/stride3, t2 = scores
+%if WIN64
+ %define s1 xmm16 ; xmm6 and xmm7 reduces code size, but
+ %define s1 xmm16 ; xmm6 and xmm7 reduce code size, but
+ %define s2 xmm17 ; they're callee-saved on win64
+ DECLARE_REG_TMP 4, 6, 0
+%else
+ %define s1 xmm6
+ %define s2 xmm7
+%if ARCH_X86_64
+ DECLARE_REG_TMP 4, 6, 5 ; scores is passed in a register on unix64
+%else
+ DECLARE_REG_TMP 4, 5, 0
+%endif
+%endif
+
+INIT_YMM avx512
+SAD_X_W4_AVX512 3, 4 ; x3_4x4
+SAD_X_W4_AVX512 3, 8 ; x3_4x8
+SAD_X_W8_AVX512 3, 4 ; x3_8x4
+SAD_X_W8_AVX512 3, 8 ; x3_8x8
+SAD_X_W8_AVX512 3, 16 ; x3_8x16
+INIT_ZMM avx512
+SAD_X_W16_AVX512 3, 8 ; x3_16x8
+SAD_X_W16_AVX512 3, 16 ; x3_16x16
+
+DECLARE_REG_TMP 5, 6, 0
+INIT_YMM avx512
+SAD_X_W4_AVX512 4, 4 ; x4_4x4
+SAD_X_W4_AVX512 4, 8 ; x4_4x8
+SAD_X_W8_AVX512 4, 4 ; x4_8x4
+SAD_X_W8_AVX512 4, 8 ; x4_8x8
+SAD_X_W8_AVX512 4, 16 ; x4_8x16
+INIT_ZMM avx512
+SAD_X_W16_AVX512 4, 8 ; x4_16x8
+SAD_X_W16_AVX512 4, 16 ; x4_16x16
+
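The pixel_sad_x3/x4 kernels compare one fenc block against three or four reference candidates sharing a stride and store the results in a scores array; the AVX-512 variants use k-mask registers built with kxnorb/kmovb, kaddb and kshiftlb to pack rows from different candidates into one vector before psadbw. A scalar reference for the x4 case, assuming the usual 16-pixel FENC_STRIDE for the source block:

    #include <stdint.h>
    #include <stdlib.h>

    /* One fenc block (rows 16 pixels apart) against four candidates that share
     * ref_stride; the four SADs land in scores[]. Width and height are
     * parameters here, while the assembly is specialized per block size. */
    static void sad_x4_ref( const uint8_t *fenc,
                            const uint8_t *ref0, const uint8_t *ref1,
                            const uint8_t *ref2, const uint8_t *ref3,
                            intptr_t ref_stride, int w, int h, int scores[4] )
    {
        const uint8_t *ref[4] = { ref0, ref1, ref2, ref3 };
        for( int i = 0; i < 4; i++ )
        {
            const uint8_t *src = fenc;
            const uint8_t *r = ref[i];
            int sum = 0;
            for( int y = 0; y < h; y++ )
            {
                for( int x = 0; x < w; x++ )
                    sum += abs( src[x] - r[x] );
                src += 16;
                r += ref_stride;
            }
            scores[i] = sum;
        }
    }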
;=============================================================================
; SAD cacheline split
;=============================================================================
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index ff8b500..3be387d 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -323,6 +323,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%endmacro
%define required_stack_alignment ((mmsize + 15) & ~15)
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
+%define high_mm_regs (16*cpuflag(avx512))
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
%ifnum %1
@@ -436,15 +438,16 @@ DECLARE_REG 14, R13, 120
%macro WIN64_PUSH_XMM 0
; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
- %if xmm_regs_used > 6
+ %if xmm_regs_used > 6 + high_mm_regs
movaps [rstk + stack_offset + 8], xmm6
%endif
- %if xmm_regs_used > 7
+ %if xmm_regs_used > 7 + high_mm_regs
movaps [rstk + stack_offset + 24], xmm7
%endif
- %if xmm_regs_used > 8
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
%assign %%i 8
- %rep xmm_regs_used-8
+ %rep %%xmm_regs_on_stack
movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
%assign %%i %%i+1
%endrep
@@ -453,10 +456,11 @@ DECLARE_REG 14, R13, 120
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
- ASSERT xmm_regs_used <= 16
- %if xmm_regs_used > 8
+ ASSERT xmm_regs_used <= 16 + high_mm_regs
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
- %assign %%pad (xmm_regs_used-8)*16 + 32
+ %assign %%pad %%xmm_regs_on_stack*16 + 32
%assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
SUB rsp, stack_size_padded
%endif
@@ -465,9 +469,10 @@ DECLARE_REG 14, R13, 120
%macro WIN64_RESTORE_XMM_INTERNAL 0
%assign %%pad_size 0
- %if xmm_regs_used > 8
- %assign %%i xmm_regs_used
- %rep xmm_regs_used-8
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
+ %assign %%i xmm_regs_used - high_mm_regs
+ %rep %%xmm_regs_on_stack
%assign %%i %%i-1
movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
%endrep
@@ -480,10 +485,10 @@ DECLARE_REG 14, R13, 120
%assign %%pad_size stack_size_padded
%endif
%endif
- %if xmm_regs_used > 7
+ %if xmm_regs_used > 7 + high_mm_regs
movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
%endif
- %if xmm_regs_used > 6
+ %if xmm_regs_used > 6 + high_mm_regs
movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
%endif
%endmacro
@@ -495,12 +500,12 @@ DECLARE_REG 14, R13, 120
%assign xmm_regs_used 0
%endmacro
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
- %if mmsize == 32
+ %if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -524,9 +529,10 @@ DECLARE_REG 12, R15, 56
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
+ %assign xmm_regs_used %3
ASSERT regs_used >= num_args
SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
@@ -536,7 +542,7 @@ DECLARE_REG 14, R13, 72
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
-%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
%macro RET 0
%if stack_size_padded > 0
@@ -547,7 +553,7 @@ DECLARE_REG 14, R13, 72
%endif
%endif
POP_IF_USED 14, 13, 12, 11, 10, 9
- %if mmsize == 32
+ %if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -592,7 +598,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
-%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
%macro RET 0
%if stack_size_padded > 0
@@ -603,7 +609,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%endif
%endif
POP_IF_USED 6, 5, 4, 3
- %if mmsize == 32
+ %if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -713,7 +719,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign stack_offset 0 ; stack pointer offset relative to the return address
%assign stack_size 0 ; amount of stack space that can be freely used inside a function
%assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
- %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+ %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
%ifnidn %3, ""
PROLOGUE %3
%endif
@@ -776,24 +782,25 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign cpuflags_sse (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2 (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
-%assign cpuflags_sse3 (1<<7) | cpuflags_sse2
-%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3
-%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
-%assign cpuflags_sse42 (1<<10)| cpuflags_sse4
-%assign cpuflags_avx (1<<11)| cpuflags_sse42
-%assign cpuflags_xop (1<<12)| cpuflags_avx
-%assign cpuflags_fma4 (1<<13)| cpuflags_avx
-%assign cpuflags_fma3 (1<<14)| cpuflags_avx
-%assign cpuflags_avx2 (1<<15)| cpuflags_fma3
-
-%assign cpuflags_cache32 (1<<16)
-%assign cpuflags_cache64 (1<<17)
-%assign cpuflags_slowctz (1<<18)
-%assign cpuflags_lzcnt (1<<19)
-%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant
-%assign cpuflags_atom (1<<21)
-%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
-%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
+%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
+%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
+%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
+%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3
+%assign cpuflags_sse42 (1<<11)| cpuflags_sse4
+%assign cpuflags_aesni (1<<12)| cpuflags_sse42
+%assign cpuflags_avx (1<<13)| cpuflags_sse42
+%assign cpuflags_xop (1<<14)| cpuflags_avx
+%assign cpuflags_fma4 (1<<15)| cpuflags_avx
+%assign cpuflags_fma3 (1<<16)| cpuflags_avx
+%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1
+%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512 (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
+
+%assign cpuflags_cache32 (1<<21)
+%assign cpuflags_cache64 (1<<22)
+%assign cpuflags_aligned (1<<23) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<24)
; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
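The cpuflags table gains lzcnt as an SSE2-level subset flag, a dedicated avx512 flag (F, CD, BW, DQ, VL together), and reshuffled bit positions, while cpuflag(x) stays a branchless all-bits-set test. The same expression in C, with illustrative masks only (the real values are the %assigns above):

    #include <assert.h>
    #include <stdint.h>

    /* Branchless "are all required bits present" check, mirroring cpuflag(x). */
    static int has_all_flags( uint32_t cpuflags, uint32_t required )
    {
        return (((cpuflags & required) ^ required) - 1) >> 31 & 1;
    }

    int main( void )
    {
        uint32_t req_avx2   = (1<<19) | (1<<16) | (1<<18);  /* avx2|fma3|bmi2, subset only */
        uint32_t req_avx512 = (1<<20) | req_avx2;           /* avx512 implies avx2 */
        assert( has_all_flags( req_avx512, req_avx2 ) == 1 );
        assert( has_all_flags( req_avx2, req_avx512 ) == 0 );
        return 0;
    }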
@@ -836,7 +843,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%if ARCH_X86_64 || cpuflag(sse2)
%ifdef __NASM_VER__
- ALIGNMODE k8
+ ALIGNMODE p6
%else
CPU amdnop
%endif
@@ -849,11 +856,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%endif
%endmacro
-; Merge mmx and sse*
+; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
-; (All 3 remain in sync through SWAP.)
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
+; (All 4 remain in sync through SWAP.)
%macro CAT_XDEFINE 3
%xdefine %1%2 %3
@@ -863,6 +871,18 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%undef %1%2
%endmacro
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
+ %if ARCH_X86_64 && cpuflag(avx512)
+ %assign %%i %1
+ %rep 16-%1
+ %assign %%i_high %%i+16
+ SWAP %%i, %%i_high
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
%macro INIT_MMX 0-1+
%assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_MMX %1
@@ -878,7 +898,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
CAT_XDEFINE nnmm, %%i, %%i
%assign %%i %%i+1
%endrep
- %rep 8
+ %rep 24
CAT_UNDEF m, %%i
CAT_UNDEF nnmm, %%i
%assign %%i %%i+1
@@ -892,7 +912,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%define mmsize 16
%define num_mmregs 8
%if ARCH_X86_64
- %define num_mmregs 16
+ %define num_mmregs 32
%endif
%define mova movdqa
%define movu movdqu
@@ -905,6 +925,10 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
+ %if WIN64
+ ; Swap callee-saved registers with volatile registers
+ AVX512_MM_PERMUTATION 6
+ %endif
%endmacro
%macro INIT_YMM 0-1+
@@ -913,7 +937,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%define mmsize 32
%define num_mmregs 8
%if ARCH_X86_64
- %define num_mmregs 16
+ %define num_mmregs 32
%endif
%define mova movdqa
%define movu movdqu
@@ -926,6 +950,29 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
+ AVX512_MM_PERMUTATION
+%endmacro
+
+%macro INIT_ZMM 0-1+
+ %assign avx_enabled 1
+ %define RESET_MM_PERMUTATION INIT_ZMM %1
+ %define mmsize 64
+ %define num_mmregs 8
+ %if ARCH_X86_64
+ %define num_mmregs 32
+ %endif
+ %define mova movdqa
+ %define movu movdqu
+ %undef movh
+ %define movnta movntdq
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, zmm %+ %%i
+ CAT_XDEFINE nnzmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+ AVX512_MM_PERMUTATION
%endmacro
INIT_XMM
@@ -934,18 +981,26 @@ INIT_XMM
%define mmmm%1 mm%1
%define mmxmm%1 mm%1
%define mmymm%1 mm%1
+ %define mmzmm%1 mm%1
%define xmmmm%1 mm%1
%define xmmxmm%1 xmm%1
%define xmmymm%1 xmm%1
+ %define xmmzmm%1 xmm%1
%define ymmmm%1 mm%1
%define ymmxmm%1 xmm%1
%define ymmymm%1 ymm%1
+ %define ymmzmm%1 ymm%1
+ %define zmmmm%1 mm%1
+ %define zmmxmm%1 xmm%1
+ %define zmmymm%1 ymm%1
+ %define zmmzmm%1 zmm%1
%define xm%1 xmm %+ m%1
%define ym%1 ymm %+ m%1
+ %define zm%1 zmm %+ m%1
%endmacro
%assign i 0
-%rep 16
+%rep 32
DECLARE_MMCAST i
%assign i i+1
%endrep
@@ -1080,12 +1135,17 @@ INIT_XMM
;=============================================================================
%assign i 0
-%rep 16
+%rep 32
%if i < 8
CAT_XDEFINE sizeofmm, i, 8
+ CAT_XDEFINE regnumofmm, i, i
%endif
CAT_XDEFINE sizeofxmm, i, 16
CAT_XDEFINE sizeofymm, i, 32
+ CAT_XDEFINE sizeofzmm, i, 64
+ CAT_XDEFINE regnumofxmm, i, i
+ CAT_XDEFINE regnumofymm, i, i
+ CAT_XDEFINE regnumofzmm, i, i
%assign i i+1
%endrep
%undef i
@@ -1202,7 +1262,7 @@ INIT_XMM
%endmacro
%endmacro
-; Instructions with both VEX and non-VEX encodings
+; Instructions with both VEX/EVEX and legacy encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
@@ -1534,15 +1594,48 @@ FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd, pd, ps, sd, ss
FMA4_INSTR fnmsub, pd, ps, sd, ss
-; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
-%ifdef __YASM_VER__
- %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
- %macro vpbroadcastq 2
- %if sizeof%1 == 16
- movddup %1, %2
- %else
- vbroadcastsd %1, %2
+; Macros for converting VEX instructions to equivalent EVEX ones.
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
+ %macro %1 2-7 fnord, fnord, %1, %2, %3
+ %ifidn %3, fnord
+ %define %%args %1, %2
+ %elifidn %4, fnord
+ %define %%args %1, %2, %3
+ %else
+ %define %%args %1, %2, %3, %4
+ %endif
+ %assign %%evex_required cpuflag(avx512) & %7
+ %ifnum regnumof%1
+ %if regnumof%1 >= 16 || sizeof%1 > 32
+ %assign %%evex_required 1
%endif
- %endmacro
- %endif
-%endif
+ %endif
+ %ifnum regnumof%2
+ %if regnumof%2 >= 16 || sizeof%2 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %if %%evex_required
+ %6 %%args
+ %else
+ %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
+ %endif
+ %endmacro
+%endmacro
+
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
+EVEX_INSTR vextractf128, vextractf32x4
+EVEX_INSTR vextracti128, vextracti32x4
+EVEX_INSTR vinsertf128, vinsertf32x4
+EVEX_INSTR vinserti128, vinserti32x4
+EVEX_INSTR vmovdqa, vmovdqa32
+EVEX_INSTR vmovdqu, vmovdqu32
+EVEX_INSTR vpand, vpandd
+EVEX_INSTR vpandn, vpandnd
+EVEX_INSTR vpor, vpord
+EVEX_INSTR vpxor, vpxord
+EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
+EVEX_INSTR vrcpss, vrcp14ss, 1
+EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
+EVEX_INSTR vrsqrtss, vrsqrt14ss, 1
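EVEX_INSTR replaces the old yasm vpbroadcastq workaround: each listed VEX mnemonic becomes a macro that emits the EVEX form only when it is actually needed, i.e. when AVX-512 is enabled and the EVEX variant is explicitly preferred (the vrcp14/vrsqrt14 cases), or when an operand is one of registers 16-31 or wider than 32 bytes, neither of which has a VEX encoding; otherwise the shorter VEX form is kept. A simplified C model of that decision (operand_t and use_evex are inventions for illustration; the macro gets the same information from regnumof* and sizeof*):

    #include <stdbool.h>

    typedef struct {
        int regnum;      /* SIMD register number, or -1 for a memory operand */
        int size_bytes;  /* 16 (xmm), 32 (ymm) or 64 (zmm) */
    } operand_t;

    static bool use_evex( bool have_avx512, bool prefer_evex,
                          operand_t dst, operand_t src )
    {
        if( have_avx512 && prefer_evex )
            return true;   /* e.g. vrcp14ps/vrsqrt14ps: higher precision wanted */
        if( dst.regnum >= 16 || dst.size_bytes > 32 )
            return true;   /* xmm16-31 and zmm have no VEX encoding */
        if( src.regnum >= 16 || src.size_bytes > 32 )
            return true;
        return false;      /* otherwise keep the shorter VEX encoding */
    }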
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index ea40bc8..7a140eb 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -303,24 +303,24 @@
%endmacro
%macro HADDD 2 ; sum junk
-%if sizeof%1 == 32
-%define %2 xmm%2
- vextracti128 %2, %1, 1
-%define %1 xmm%1
- paddd %1, %2
+%if sizeof%1 >= 64
+ vextracti32x8 ymm%2, zmm%1, 1
+ paddd ymm%1, ymm%2
%endif
-%if mmsize >= 16
- MOVHL %2, %1
- paddd %1, %2
+%if sizeof%1 >= 32
+ vextracti128 xmm%2, ymm%1, 1
+ paddd xmm%1, xmm%2
+%endif
+%if sizeof%1 >= 16
+ MOVHL xmm%2, xmm%1
+ paddd xmm%1, xmm%2
%endif
%if cpuflag(xop) && sizeof%1 == 16
- vphadddq %1, %1
+ vphadddq xmm%1, xmm%1
%else
- PSHUFLW %2, %1, q0032
- paddd %1, %2
+ PSHUFLW xmm%2, xmm%1, q1032
+ paddd xmm%1, xmm%2
%endif
-%undef %1
-%undef %2
%endmacro
%macro HADDW 2 ; reg, tmp
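HADDD now folds the accumulator step by step, zmm to ymm to xmm and then within the low xmm, so one macro body covers 16-, 32- and 64-byte vectors and leaves the sum in the low dword of xmm%1. A scalar reference of that reduction:

    #include <stdint.h>

    /* Fold the upper half onto the lower half until one 32-bit lane remains,
     * mirroring the vextracti32x8 / vextracti128 / MOVHL / PSHUFLW steps.
     * n is 4, 8 or 16 lanes. */
    static uint32_t haddd_ref( const uint32_t *lanes, int n )
    {
        uint32_t tmp[16];
        for( int i = 0; i < n; i++ )
            tmp[i] = lanes[i];
        while( n > 1 )
        {
            n /= 2;
            for( int i = 0; i < n; i++ )
                tmp[i] += tmp[i + n];
        }
        return tmp[0];
    }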
diff --git a/config.guess b/config.guess
index 2e9ad7f..31e01ef 100755
--- a/config.guess
+++ b/config.guess
@@ -1,8 +1,8 @@
#! /bin/sh
# Attempt to guess a canonical system name.
-# Copyright 1992-2016 Free Software Foundation, Inc.
+# Copyright 1992-2017 Free Software Foundation, Inc.
-timestamp='2016-10-02'
+timestamp='2017-11-07'
# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
@@ -15,7 +15,7 @@ timestamp='2016-10-02'
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, see <http://www.gnu.org/licenses/>.
+# along with this program; if not, see <https://www.gnu.org/licenses/>.
#
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
@@ -27,7 +27,7 @@ timestamp='2016-10-02'
# Originally written by Per Bothner; maintained since 2000 by Ben Elliston.
#
# You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
+# https://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
#
# Please send patches to <config-patches at gnu.org>.
@@ -39,7 +39,7 @@ Usage: $0 [OPTION]
Output the configuration name of the system \`$me' is run on.
-Operation modes:
+Options:
-h, --help print this help, then exit
-t, --time-stamp print date of last modification, then exit
-v, --version print version number, then exit
@@ -50,7 +50,7 @@ version="\
GNU config.guess ($timestamp)
Originally written by Per Bothner.
-Copyright 1992-2016 Free Software Foundation, Inc.
+Copyright 1992-2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -244,6 +244,9 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'`
echo ${UNAME_MACHINE_ARCH}-unknown-libertybsd${UNAME_RELEASE}
exit ;;
+ *:MidnightBSD:*:*)
+ echo ${UNAME_MACHINE}-unknown-midnightbsd${UNAME_RELEASE}
+ exit ;;
*:ekkoBSD:*:*)
echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
exit ;;
@@ -259,6 +262,9 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
*:Sortix:*:*)
echo ${UNAME_MACHINE}-unknown-sortix
exit ;;
+ *:Redox:*:*)
+ echo ${UNAME_MACHINE}-unknown-redox
+ exit ;;
alpha:OSF1:*:*)
case $UNAME_RELEASE in
*4.0)
@@ -315,15 +321,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
exitcode=$?
trap '' 0
exit $exitcode ;;
- Alpha\ *:Windows_NT*:*)
- # How do we know it's Interix rather than the generic POSIX subsystem?
- # Should we change UNAME_MACHINE based on the output of uname instead
- # of the specific Alpha model?
- echo alpha-pc-interix
- exit ;;
- 21064:Windows_NT:50:3)
- echo alpha-dec-winnt3.5
- exit ;;
Amiga*:UNIX_System_V:4.0:*)
echo m68k-unknown-sysv4
exit ;;
@@ -485,13 +482,13 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
#endif
#if defined (host_mips) && defined (MIPSEB)
#if defined (SYSTYPE_SYSV)
- printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
+ printf ("mips-mips-riscos%ssysv\\n", argv[1]); exit (0);
#endif
#if defined (SYSTYPE_SVR4)
- printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
+ printf ("mips-mips-riscos%ssvr4\\n", argv[1]); exit (0);
#endif
#if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
- printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
+ printf ("mips-mips-riscos%sbsd\\n", argv[1]); exit (0);
#endif
#endif
exit (-1);
@@ -614,7 +611,7 @@ EOF
*:AIX:*:*)
echo rs6000-ibm-aix
exit ;;
- ibmrt:4.4BSD:*|romp-ibm:BSD:*)
+ ibmrt:4.4BSD:*|romp-ibm:4.4BSD:*)
echo romp-ibm-bsd4.4
exit ;;
ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and
@@ -635,8 +632,8 @@ EOF
9000/[34678]??:HP-UX:*:*)
HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
case "${UNAME_MACHINE}" in
- 9000/31? ) HP_ARCH=m68000 ;;
- 9000/[34]?? ) HP_ARCH=m68k ;;
+ 9000/31?) HP_ARCH=m68000 ;;
+ 9000/[34]??) HP_ARCH=m68k ;;
9000/[678][0-9][0-9])
if [ -x /usr/bin/getconf ]; then
sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
@@ -749,7 +746,7 @@ EOF
{ echo "$SYSTEM_NAME"; exit; }
echo unknown-hitachi-hiuxwe2
exit ;;
- 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
+ 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:*)
echo hppa1.1-hp-bsd
exit ;;
9000/8??:4.3bsd:*:*)
@@ -758,7 +755,7 @@ EOF
*9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
echo hppa1.0-hp-mpeix
exit ;;
- hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
+ hp7??:OSF1:*:* | hp8?[79]:OSF1:*:*)
echo hppa1.1-hp-osf
exit ;;
hp8??:OSF1:*:*)
@@ -837,10 +834,11 @@ EOF
UNAME_PROCESSOR=`/usr/bin/uname -p`
case ${UNAME_PROCESSOR} in
amd64)
- echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
- *)
- echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+ UNAME_PROCESSOR=x86_64 ;;
+ i386)
+ UNAME_PROCESSOR=i586 ;;
esac
+ echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
exit ;;
i*:CYGWIN*:*)
echo ${UNAME_MACHINE}-pc-cygwin
@@ -854,10 +852,6 @@ EOF
*:MSYS*:*)
echo ${UNAME_MACHINE}-pc-msys
exit ;;
- i*:windows32*:*)
- # uname -m includes "-pc" on this system.
- echo ${UNAME_MACHINE}-mingw32
- exit ;;
i*:PW*:*)
echo ${UNAME_MACHINE}-pc-pw32
exit ;;
@@ -873,27 +867,12 @@ EOF
echo ia64-unknown-interix${UNAME_RELEASE}
exit ;;
esac ;;
- [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
- echo i${UNAME_MACHINE}-pc-mks
- exit ;;
- 8664:Windows_NT:*)
- echo x86_64-pc-mks
- exit ;;
- i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
- # How do we know it's Interix rather than the generic POSIX subsystem?
- # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
- # UNAME_MACHINE based on the output of uname instead of i386?
- echo i586-pc-interix
- exit ;;
i*:UWIN*:*)
echo ${UNAME_MACHINE}-pc-uwin
exit ;;
amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
echo x86_64-unknown-cygwin
exit ;;
- p*:CYGWIN*:*)
- echo powerpcle-unknown-cygwin
- exit ;;
prep*:SunOS:5.*:*)
echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
exit ;;
@@ -1096,7 +1075,7 @@ EOF
i*86:*DOS:*:*)
echo ${UNAME_MACHINE}-pc-msdosdjgpp
exit ;;
- i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
+ i*86:*:4.*:*)
UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
@@ -1303,14 +1282,21 @@ EOF
if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then
if [ "$CC_FOR_BUILD" != no_compiler_found ]; then
if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
- (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
- grep IS_64BIT_ARCH >/dev/null
+ (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
+ grep IS_64BIT_ARCH >/dev/null
then
case $UNAME_PROCESSOR in
i386) UNAME_PROCESSOR=x86_64 ;;
powerpc) UNAME_PROCESSOR=powerpc64 ;;
esac
fi
+ # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc
+ if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \
+ (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
+ grep IS_PPC >/dev/null
+ then
+ UNAME_PROCESSOR=powerpc
+ fi
fi
elif test "$UNAME_PROCESSOR" = i386 ; then
# Avoid executing cc on OS X 10.9, as it ships with a stub
@@ -1334,15 +1320,18 @@ EOF
*:QNX:*:4*)
echo i386-pc-qnx
exit ;;
- NEO-?:NONSTOP_KERNEL:*:*)
+ NEO-*:NONSTOP_KERNEL:*:*)
echo neo-tandem-nsk${UNAME_RELEASE}
exit ;;
NSE-*:NONSTOP_KERNEL:*:*)
echo nse-tandem-nsk${UNAME_RELEASE}
exit ;;
- NSR-?:NONSTOP_KERNEL:*:*)
+ NSR-*:NONSTOP_KERNEL:*:*)
echo nsr-tandem-nsk${UNAME_RELEASE}
exit ;;
+ NSX-*:NONSTOP_KERNEL:*:*)
+ echo nsx-tandem-nsk${UNAME_RELEASE}
+ exit ;;
*:NonStop-UX:*:*)
echo mips-compaq-nonstopux
exit ;;
@@ -1414,16 +1403,28 @@ EOF
exit ;;
esac
+echo "$0: unable to guess system type" >&2
+
+case "${UNAME_MACHINE}:${UNAME_SYSTEM}" in
+ mips:Linux | mips64:Linux)
+ # If we got here on MIPS GNU/Linux, output extra information.
+ cat >&2 <<EOF
+
+NOTE: MIPS GNU/Linux systems require a C compiler to fully recognize
+the system type. Please install a C compiler and try again.
+EOF
+ ;;
+esac
+
cat >&2 <<EOF
-$0: unable to guess system type
This script (version $timestamp), has failed to recognize the
-operating system you are using. If your script is old, overwrite
-config.guess and config.sub with the latest versions from:
+operating system you are using. If your script is old, overwrite *all*
+copies of config.guess and config.sub with the latest versions from:
- http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
+ https://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
and
- http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
+ https://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
If $0 has already been updated, send the following data and any
information you think might be pertinent to config-patches at gnu.org to
@@ -1455,7 +1456,7 @@ EOF
exit 1
# Local variables:
-# eval: (add-hook 'write-file-hooks 'time-stamp)
+# eval: (add-hook 'write-file-functions 'time-stamp)
# time-stamp-start: "timestamp='"
# time-stamp-format: "%:y-%02m-%02d"
# time-stamp-end: "'"
diff --git a/config.sub b/config.sub
index dd2ca93..00f68b8 100755
--- a/config.sub
+++ b/config.sub
@@ -1,8 +1,8 @@
#! /bin/sh
# Configuration validation subroutine script.
-# Copyright 1992-2016 Free Software Foundation, Inc.
+# Copyright 1992-2017 Free Software Foundation, Inc.
-timestamp='2016-11-04'
+timestamp='2017-11-23'
# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
@@ -15,7 +15,7 @@ timestamp='2016-11-04'
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, see <http://www.gnu.org/licenses/>.
+# along with this program; if not, see <https://www.gnu.org/licenses/>.
#
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
@@ -33,7 +33,7 @@ timestamp='2016-11-04'
# Otherwise, we print the canonical config type on stdout and succeed.
# You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
+# https://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
# This file is supposed to be the same for all GNU packages
# and recognize all the CPU types, system types and aliases
@@ -57,7 +57,7 @@ Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS
Canonicalize a configuration name.
-Operation modes:
+Options:
-h, --help print this help, then exit
-t, --time-stamp print date of last modification, then exit
-v, --version print version number, then exit
@@ -67,7 +67,7 @@ Report bugs and patches to <config-patches at gnu.org>."
version="\
GNU config.sub ($timestamp)
-Copyright 1992-2016 Free Software Foundation, Inc.
+Copyright 1992-2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -229,9 +229,6 @@ case $os in
-ptx*)
basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'`
;;
- -windowsnt*)
- os=`echo $os | sed -e 's/windowsnt/winnt/'`
- ;;
-psos*)
os=-psos
;;
@@ -263,7 +260,7 @@ case $basic_machine in
| fido | fr30 | frv | ft32 \
| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
| hexagon \
- | i370 | i860 | i960 | ia64 \
+ | i370 | i860 | i960 | ia16 | ia64 \
| ip2k | iq2000 \
| k1om \
| le32 | le64 \
@@ -315,7 +312,7 @@ case $basic_machine in
| ubicom32 \
| v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \
| visium \
- | we32k \
+ | wasm32 \
| x86 | xc16x | xstormy16 | xtensa \
| z8k | z80)
basic_machine=$basic_machine-unknown
@@ -388,7 +385,7 @@ case $basic_machine in
| h8300-* | h8500-* \
| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
| hexagon-* \
- | i*86-* | i860-* | i960-* | ia64-* \
+ | i*86-* | i860-* | i960-* | ia16-* | ia64-* \
| ip2k-* | iq2000-* \
| k1om-* \
| le32-* | le64-* \
@@ -446,6 +443,7 @@ case $basic_machine in
| v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \
| vax-* \
| visium-* \
+ | wasm32-* \
| we32k-* \
| x86-* | x86_64-* | xc16x-* | xps100-* \
| xstormy16-* | xtensa*-* \
@@ -641,7 +639,7 @@ case $basic_machine in
basic_machine=rs6000-bull
os=-bosx
;;
- dpx2* | dpx2*-bull)
+ dpx2*)
basic_machine=m68k-bull
os=-sysv3
;;
@@ -903,7 +901,7 @@ case $basic_machine in
basic_machine=v70-nec
os=-sysv
;;
- next | m*-next )
+ next | m*-next)
basic_machine=m68k-next
case $os in
-nextstep* )
@@ -948,6 +946,9 @@ case $basic_machine in
nsr-tandem)
basic_machine=nsr-tandem
;;
+ nsx-tandem)
+ basic_machine=nsx-tandem
+ ;;
op50n-* | op60c-*)
basic_machine=hppa1.1-oki
os=-proelf
@@ -1243,6 +1244,9 @@ case $basic_machine in
basic_machine=a29k-wrs
os=-vxworks
;;
+ wasm32)
+ basic_machine=wasm32-unknown
+ ;;
w65*)
basic_machine=w65-wdc
os=-none
@@ -1251,6 +1255,9 @@ case $basic_machine in
basic_machine=hppa1.1-winbond
os=-proelf
;;
+ x64)
+ basic_machine=x86_64-pc
+ ;;
xbox)
basic_machine=i686-pc
os=-mingw32
@@ -1358,8 +1365,8 @@ esac
if [ x"$os" != x"" ]
then
case $os in
- # First match some system type aliases
- # that might get confused with valid system types.
+ # First match some system type aliases that might get confused
+ # with valid system types.
# -solaris* is a basic system type, with this one exception.
-auroraux)
os=-auroraux
@@ -1379,9 +1386,9 @@ case $os in
-gnu/linux*)
os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'`
;;
- # First accept the basic system types.
+ # Now accept the basic system types.
# The portable systems comes first.
- # Each alternative MUST END IN A *, to match a version number.
+ # Each alternative MUST end in a * to match a version number.
# -sysv* is not here because it comes later, after sysvr4.
-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
| -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
@@ -1397,7 +1404,7 @@ case $os in
| -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
| -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
| -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
- | -chorusos* | -chorusrdb* | -cegcc* \
+ | -chorusos* | -chorusrdb* | -cegcc* | -glidix* \
| -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
| -midipix* | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \
| -linux-newlib* | -linux-musl* | -linux-uclibc* \
@@ -1409,7 +1416,7 @@ case $os in
| -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
| -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
| -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* \
- | -onefs* | -tirtos* | -phoenix* | -fuchsia*)
+ | -onefs* | -tirtos* | -phoenix* | -fuchsia* | -redox*)
# Remember, each alternative MUST END IN *, to match a version number.
;;
-qnx*)
@@ -1484,7 +1491,7 @@ case $os in
-nova*)
os=-rtmk-nova
;;
- -ns2 )
+ -ns2)
os=-nextstep2
;;
-nsk*)
@@ -1539,6 +1546,19 @@ case $os in
-dicos*)
os=-dicos
;;
+ -pikeos*)
+ # Until real need of OS specific support for
+ # particular features comes up, bare metal
+ # configurations are quite functional.
+ case $basic_machine in
+ arm*)
+ os=-eabi
+ ;;
+ *)
+ os=-elf
+ ;;
+ esac
+ ;;
-nacl*)
;;
-ios)
@@ -1638,6 +1658,9 @@ case $basic_machine in
sparc-* | *-sun)
os=-sunos4.1.1
;;
+ pru-*)
+ os=-elf
+ ;;
*-be)
os=-beos
;;
@@ -1683,7 +1706,7 @@ case $basic_machine in
m88k-omron*)
os=-luna
;;
- *-next )
+ *-next)
os=-nextstep
;;
*-sequent)
@@ -1818,7 +1841,7 @@ echo $basic_machine$os
exit
# Local variables:
-# eval: (add-hook 'write-file-hooks 'time-stamp)
+# eval: (add-hook 'write-file-functions 'time-stamp)
# time-stamp-start: "timestamp='"
# time-stamp-format: "%:y-%02m-%02d"
# time-stamp-end: "'"
diff --git a/configure b/configure
index d9de8ab..f7b14d9 100755
--- a/configure
+++ b/configure
@@ -554,9 +554,12 @@ if [[ $host_os = mingw* || $host_os = cygwin* ]]; then
compiler_style=MS
CFLAGS="$CFLAGS -Qstd=c99 -nologo -Qms0 -DHAVE_STRING_H -I\$(SRCPATH)/extras"
QPRE="-Q"
- `$CC 2>&1 | grep -q IA-32` && host_cpu=i486
- `$CC 2>&1 | grep -q "Intel(R) 64"` && host_cpu=x86_64
- cpp_check "" "" "_MSC_VER >= 1400" || die "Windows Intel Compiler support requires Visual Studio 2005 or newer"
+ cpp_check '' '' '_MSC_VER >= 1400' || die "Windows Intel Compiler support requires Visual Studio 2005 or newer"
+ if cpp_check '' '' 'defined(_M_AMD64) || defined(_M_X64)' ; then
+ host_cpu=x86_64
+ elif cpp_check '' '' 'defined(_M_IX86)' ; then
+ host_cpu=i486
+ fi
if cc_check '' -Qdiag-error:10006,10157 ; then
CHECK_CFLAGS="$CHECK_CFLAGS -Qdiag-error:10006,10157"
fi
@@ -565,9 +568,16 @@ if [[ $host_os = mingw* || $host_os = cygwin* ]]; then
compiler=CL
compiler_style=MS
CFLAGS="$CFLAGS -nologo -GS- -DHAVE_STRING_H -I\$(SRCPATH)/extras"
- `$CC 2>&1 | grep -q 'x86'` && host_cpu=i486
- `$CC 2>&1 | grep -q 'x64'` && host_cpu=x86_64
cpp_check '' '' '_MSC_VER > 1800 || (_MSC_VER == 1800 && _MSC_FULL_VER >= 180030324)' || die "Microsoft Visual Studio support requires Visual Studio 2013 Update 2 or newer"
+ if cpp_check '' '' 'defined(_M_AMD64) || defined(_M_X64)' ; then
+ host_cpu=x86_64
+ elif cpp_check '' '' 'defined(_M_IX86)' ; then
+ host_cpu=i486
+ elif cpp_check '' '' 'defined(_M_ARM64)' ; then
+ host_cpu=aarch64
+ elif cpp_check '' '' 'defined(_M_ARM)' ; then
+ host_cpu=arm
+ fi
else
# MinGW uses broken pre-VS2015 Microsoft printf functions unless it's told to use the POSIX ones.
CFLAGS="$CFLAGS -D_POSIX_C_SOURCE=200112L"
@@ -579,7 +589,7 @@ else
fi
fi
-if [[ "$cc_base" = clang* ]]; then
+if [ $compiler = GNU ]; then
if cc_check '' -Werror=unknown-warning-option ; then
CHECK_CFLAGS="$CHECK_CFLAGS -Werror=unknown-warning-option"
fi
@@ -678,7 +688,7 @@ stack_alignment=4
case $host_cpu in
i*86)
ARCH="X86"
- AS="${AS-yasm}"
+ AS="${AS-nasm}"
AS_EXT=".asm"
ASFLAGS="$ASFLAGS -DARCH_X86_64=0 -I\$(SRCPATH)/common/x86/"
if [ $compiler = GNU ]; then
@@ -704,7 +714,7 @@ case $host_cpu in
;;
x86_64)
ARCH="X86_64"
- AS="${AS-yasm}"
+ AS="${AS-nasm}"
AS_EXT=".asm"
ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -I\$(SRCPATH)/common/x86/"
stack_alignment=16
@@ -853,7 +863,10 @@ if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o
fi
if [ $compiler = GNU -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
- if cc_check '' -mpreferred-stack-boundary=5 ; then
+ if cc_check '' -mpreferred-stack-boundary=6 ; then
+ CFLAGS="$CFLAGS -mpreferred-stack-boundary=6"
+ stack_alignment=64
+ elif cc_check '' -mpreferred-stack-boundary=5 ; then
CFLAGS="$CFLAGS -mpreferred-stack-boundary=5"
stack_alignment=32
elif [ $stack_alignment -lt 16 ] && cc_check '' -mpreferred-stack-boundary=4 ; then
@@ -876,15 +889,14 @@ elif [ $compiler = ICC -a $ARCH = X86 ]; then
fi
if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
- if ! as_check "vpmovzxwd ymm0, xmm0" ; then
+ if ! as_check "vmovdqa32 [eax]{k1}{z}, zmm0" ; then
VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1`
echo "Found $VER"
- echo "Minimum version is yasm-1.2.0"
+ echo "Minimum version is nasm-2.13"
echo "If you really want to compile without asm, configure with --disable-asm."
exit 1
fi
cc_check '' '' '__asm__("pabsw %xmm0, %xmm0");' && define HAVE_X86_INLINE_ASM
- ASFLAGS="$ASFLAGS -Worphan-labels"
define HAVE_MMX
fi
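The assembler probe switches from a yasm AVX2 test to requiring nasm 2.13 or newer, checked by assembling an AVX-512 instruction with masking and zeroing, while the separate HAVE_X86_INLINE_ASM check still just compiles a one-line GCC-style asm statement. Roughly what that compile-only probe amounts to as a standalone file (a sketch, not the actual cc_check harness):

    /* If this translation unit builds, the compiler accepts GCC-style x86
     * inline assembly. The instruction is guarded so the program is also
     * safe to run on CPUs without SSSE3. */
    int main( void )
    {
        if( 0 )
            __asm__( "pabsw %xmm0, %xmm0" );
        return 0;
    }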
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 3fbdd53..036d6c1 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -34,37 +34,23 @@
typedef struct
{
- /* 16x16 */
- int i_rd16x16;
x264_me_t me16x16;
x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
-
- /* 8x8 */
- int i_cost8x8;
- /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
- ALIGNED_4( int16_t mvc[32][5][2] );
x264_me_t me8x8[4];
-
- /* Sub 4x4 */
- int i_cost4x4[4]; /* cost per 8x8 partition */
x264_me_t me4x4[4][4];
-
- /* Sub 8x4 */
- int i_cost8x4[4]; /* cost per 8x8 partition */
x264_me_t me8x4[4][2];
-
- /* Sub 4x8 */
- int i_cost4x8[4]; /* cost per 8x8 partition */
x264_me_t me4x8[4][2];
-
- /* 16x8 */
- int i_cost16x8;
x264_me_t me16x8[2];
-
- /* 8x16 */
- int i_cost8x16;
x264_me_t me8x16[2];
-
+ int i_rd16x16;
+ int i_cost8x8;
+ int i_cost4x4[4]; /* cost per 8x8 partition */
+ int i_cost8x4[4]; /* cost per 8x8 partition */
+ int i_cost4x8[4]; /* cost per 8x8 partition */
+ int i_cost16x8;
+ int i_cost8x16;
+ /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
+ ALIGNED_4( int16_t mvc[32][5][2] );
} x264_mb_analysis_list_t;
typedef struct
@@ -278,29 +264,31 @@ static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];
static int init_costs( x264_t *h, float *logs, int qp )
{
- int lambda = x264_lambda_tab[qp];
if( h->cost_mv[qp] )
return 0;
+
+ int mv_range = h->param.analyse.i_mv_range;
+ int lambda = x264_lambda_tab[qp];
/* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
- CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) );
- h->cost_mv[qp] += 2*4*2048;
- for( int i = 0; i <= 2*4*2048; i++ )
+ CHECKED_MALLOC( h->cost_mv[qp], (4*4*mv_range + 1) * sizeof(uint16_t) );
+ h->cost_mv[qp] += 2*4*mv_range;
+ for( int i = 0; i <= 2*4*mv_range; i++ )
{
h->cost_mv[qp][-i] =
- h->cost_mv[qp][i] = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 );
+ h->cost_mv[qp][i] = X264_MIN( (int)(lambda * logs[i] + .5f), UINT16_MAX );
}
x264_pthread_mutex_lock( &cost_ref_mutex );
for( int i = 0; i < 3; i++ )
for( int j = 0; j < 33; j++ )
- x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 );
+ x264_cost_ref[qp][i][j] = i ? X264_MIN( lambda * bs_size_te( i, j ), UINT16_MAX ) : 0;
x264_pthread_mutex_unlock( &cost_ref_mutex );
if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
{
for( int j = 0; j < 4; j++ )
{
- CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) );
- h->cost_mv_fpel[qp][j] += 2*2048;
- for( int i = -2*2048; i < 2*2048; i++ )
+ CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*mv_range + 1) * sizeof(uint16_t) );
+ h->cost_mv_fpel[qp][j] += 2*mv_range;
+ for( int i = -2*mv_range; i < 2*mv_range; i++ )
h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
}
}
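The motion-vector cost tables are now sized from h->param.analyse.i_mv_range instead of the hard-coded 2048, and the stored pointer is offset to the middle of the allocation so it can be indexed with signed displacements. The allocation pattern, as a minimal sketch with hypothetical helper names:

    #include <stdint.h>
    #include <stdlib.h>

    /* Allocate 4*4*mv_range + 1 entries and keep a pointer to the centre, so
     * cost[d] is valid for any displacement d in [-2*4*mv_range, 2*4*mv_range].
     * Freeing undoes the offset, matching the x264_analyse_free_costs change. */
    static uint16_t *alloc_mv_cost_table( int mv_range )
    {
        uint16_t *base = malloc( (4*4*mv_range + 1) * sizeof(uint16_t) );
        return base ? base + 2*4*mv_range : NULL;
    }

    static void free_mv_cost_table( uint16_t *cost, int mv_range )
    {
        if( cost )
            free( cost - 2*4*mv_range );
    }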
@@ -314,12 +302,13 @@ fail:
int x264_analyse_init_costs( x264_t *h )
{
- float *logs = x264_malloc( (2*4*2048+1) * sizeof(float) );
+ int mv_range = h->param.analyse.i_mv_range;
+ float *logs = x264_malloc( (2*4*mv_range+1) * sizeof(float) );
if( !logs )
return -1;
logs[0] = 0.718f;
- for( int i = 1; i <= 2*4*2048; i++ )
+ for( int i = 1; i <= 2*4*mv_range; i++ )
logs[i] = log2f( i+1 ) * 2.0f + 1.718f;
for( int qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ )
@@ -338,13 +327,14 @@ fail:
void x264_analyse_free_costs( x264_t *h )
{
+ int mv_range = h->param.analyse.i_mv_range;
for( int i = 0; i < QP_MAX+1; i++ )
{
if( h->cost_mv[i] )
- x264_free( h->cost_mv[i] - 2*4*2048 );
+ x264_free( h->cost_mv[i] - 2*4*mv_range );
if( h->cost_mv_fpel[i][0] )
for( int j = 0; j < 4; j++ )
- x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
+ x264_free( h->cost_mv_fpel[i][j] - 2*mv_range );
}
}
@@ -465,11 +455,10 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
/* Calculate max allowed MV range */
-#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
- h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
- h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
+ h->mb.mv_min_spel[0] = X264_MAX( h->mb.mv_min[0], -i_fmv_range );
+ h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max[0], i_fmv_range-1 );
if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
{
int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
@@ -513,9 +502,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
mb_y = (h->mb.i_mb_y >> j) + (i == 1);
h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
- h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range );
- h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] );
- h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 );
+ h->mb.mv_miny_spel_row[i] = X264_MAX( h->mb.mv_miny_row[i], -i_fmv_range );
+ h->mb.mv_maxy_spel_row[i] = X264_MIN3( h->mb.mv_maxy_row[i], i_fmv_range-1, 4*thread_mvy_range );
h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
}
@@ -524,9 +512,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
{
h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
- h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
- h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
- h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
+ h->mb.mv_min_spel[1] = X264_MAX( h->mb.mv_min[1], -i_fmv_range );
+ h->mb.mv_max_spel[1] = X264_MIN3( h->mb.mv_max[1], i_fmv_range-1, 4*thread_mvy_range );
h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
}
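CLIP_FMV is dropped in favour of explicit X264_MAX/X264_MIN/X264_MIN3 clamps, which fold the thread_mvy_range limit into a single expression and clamp the minimum to -i_fmv_range and the maximum to i_fmv_range-1. A plain-C restatement of the vertical clamp, with min3 written out (names are illustrative; the real code writes into h->mb directly):

    static inline int min3( int a, int b, int c )
    {
        int m = a < b ? a : b;
        return m < c ? m : c;
    }

    static void clamp_mv_y( int mv_min, int mv_max, int fmv_range,
                            int thread_mvy_range, int *spel_min, int *spel_max )
    {
        *spel_min = mv_min > -fmv_range ? mv_min : -fmv_range;           /* X264_MAX */
        *spel_max = min3( mv_max, fmv_range - 1, 4 * thread_mvy_range ); /* X264_MIN3 */
    }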
@@ -541,7 +528,6 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
}
-#undef CLIP_FMV
a->l0.me16x16.cost =
a->l0.i_rd16x16 =
@@ -713,8 +699,12 @@ static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
if( !h->mb.i_psy_rd )
return;
- /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
- h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
+
+ M128( &h->mb.pic.fenc_hadamard_cache[0] ) = M128_ZERO;
+ M128( &h->mb.pic.fenc_hadamard_cache[2] ) = M128_ZERO;
+ M128( &h->mb.pic.fenc_hadamard_cache[4] ) = M128_ZERO;
+ M128( &h->mb.pic.fenc_hadamard_cache[6] ) = M128_ZERO;
+ h->mb.pic.fenc_hadamard_cache[8] = 0;
if( b_satd )
h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
}
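Instead of memzero_aligned over fenc_hadamard_cache, which intentionally wrote past the array into fenc_satd_cache, the cache is now cleared with four 16-byte stores plus one scalar store, touching exactly its entries. A sketch of the same idea, assuming the nine-element 64-bit layout implied by the indices above (memset stands in for x264's aligned M128(...) = M128_ZERO stores):

    #include <stdint.h>
    #include <string.h>

    /* Four 16-byte stores cover entries 0-7, a scalar store clears entry 8,
     * and nothing is written past the array. */
    static void clear_hadamard_cache( uint64_t cache[9] )
    {
        for( int i = 0; i < 8; i += 2 )
            memset( &cache[i], 0, 2 * sizeof(uint64_t) );
        cache[8] = 0;
    }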
@@ -743,8 +733,8 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
}
- a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
- + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
+ a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
+ + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
return;
}
@@ -759,8 +749,8 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
- satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
- satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
+ satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE );
+ satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
for( ; *predict_mode >= 0; predict_mode++ )
{
@@ -788,8 +778,8 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
}
/* we calculate the cost */
- i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
- h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
+ i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ) +
+ h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ) +
a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );
a->i_satd_chroma_dir[i_mode] = i_satd;
@@ -845,7 +835,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
if( a->i_satd_i16x16 <= i16x16_thresh )
{
h->predict_16x16[I_PRED_16x16_P]( p_dst );
- a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
+ a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, p_dst, FDEC_STRIDE );
a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
}
@@ -862,7 +852,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
else
h->predict_16x16[i_mode]( p_dst );
- i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
+ i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, p_dst, FDEC_STRIDE ) +
lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
a->i_satd_i16x16_dir[i_mode] = i_satd;
@@ -1065,7 +1055,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
else
h->predict_4x4[i_mode]( p_dst_by );
- i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
+ i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_src_by, FENC_STRIDE, p_dst_by, FDEC_STRIDE );
if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
{
i_satd -= lambda * 3;
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 27052cd..9debd1e 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -801,7 +801,7 @@ void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat
static void ALWAYS_INLINE x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
-#if ARCH_X86_64 && HAVE_MMX
+#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ )
h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
x264_cabac_block_residual_c( h, cb, ctx_block_cat, l );
@@ -915,7 +915,7 @@ void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_
static ALWAYS_INLINE void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
-#if ARCH_X86_64 && HAVE_MMX
+#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ )
h->bsf.cabac_block_residual_8x8_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
x264_cabac_block_residual_8x8_rd_c( h, cb, ctx_block_cat, l );
@@ -923,7 +923,7 @@ static ALWAYS_INLINE void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t
}
static ALWAYS_INLINE void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
-#if ARCH_X86_64 && HAVE_MMX
+#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ )
h->bsf.cabac_block_residual_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
x264_cabac_block_residual_rd_c( h, cb, ctx_block_cat, l );
@@ -1057,29 +1057,29 @@ static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_
src = dst;
#define MUNGE_8x8_NNZ( MUNGE )\
-if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[0]] )\
+if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[0]] && !(h->mb.cbp[h->mb.i_mb_left_xy[0]] & 0x1000) )\
{\
- MUNGE( nnzbak[0][0], h->mb.cache.non_zero_count[x264_scan8[16*0+ 0] - 1], 0x80 )\
- MUNGE( nnzbak[0][1], h->mb.cache.non_zero_count[x264_scan8[16*0+ 2] - 1], 0x80 )\
- MUNGE( nnzbak[1][0], h->mb.cache.non_zero_count[x264_scan8[16*1+ 0] - 1], 0x80 )\
- MUNGE( nnzbak[1][1], h->mb.cache.non_zero_count[x264_scan8[16*1+ 2] - 1], 0x80 )\
- MUNGE( nnzbak[2][0], h->mb.cache.non_zero_count[x264_scan8[16*2+ 0] - 1], 0x80 )\
- MUNGE( nnzbak[2][1], h->mb.cache.non_zero_count[x264_scan8[16*2+ 2] - 1], 0x80 )\
+ MUNGE( nnzbak[0][0], h->mb.cache.non_zero_count[x264_scan8[16*0+ 0] - 1], 0x00 )\
+ MUNGE( nnzbak[0][1], h->mb.cache.non_zero_count[x264_scan8[16*0+ 2] - 1], 0x00 )\
+ MUNGE( nnzbak[1][0], h->mb.cache.non_zero_count[x264_scan8[16*1+ 0] - 1], 0x00 )\
+ MUNGE( nnzbak[1][1], h->mb.cache.non_zero_count[x264_scan8[16*1+ 2] - 1], 0x00 )\
+ MUNGE( nnzbak[2][0], h->mb.cache.non_zero_count[x264_scan8[16*2+ 0] - 1], 0x00 )\
+ MUNGE( nnzbak[2][1], h->mb.cache.non_zero_count[x264_scan8[16*2+ 2] - 1], 0x00 )\
}\
-if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[1]] )\
+if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[1]] && !(h->mb.cbp[h->mb.i_mb_left_xy[1]] & 0x1000) )\
{\
- MUNGE( nnzbak[0][2], h->mb.cache.non_zero_count[x264_scan8[16*0+ 8] - 1], 0x80 )\
- MUNGE( nnzbak[0][3], h->mb.cache.non_zero_count[x264_scan8[16*0+10] - 1], 0x80 )\
- MUNGE( nnzbak[1][2], h->mb.cache.non_zero_count[x264_scan8[16*1+ 8] - 1], 0x80 )\
- MUNGE( nnzbak[1][3], h->mb.cache.non_zero_count[x264_scan8[16*1+10] - 1], 0x80 )\
- MUNGE( nnzbak[2][2], h->mb.cache.non_zero_count[x264_scan8[16*2+ 8] - 1], 0x80 )\
- MUNGE( nnzbak[2][3], h->mb.cache.non_zero_count[x264_scan8[16*2+10] - 1], 0x80 )\
+ MUNGE( nnzbak[0][2], h->mb.cache.non_zero_count[x264_scan8[16*0+ 8] - 1], 0x00 )\
+ MUNGE( nnzbak[0][3], h->mb.cache.non_zero_count[x264_scan8[16*0+10] - 1], 0x00 )\
+ MUNGE( nnzbak[1][2], h->mb.cache.non_zero_count[x264_scan8[16*1+ 8] - 1], 0x00 )\
+ MUNGE( nnzbak[1][3], h->mb.cache.non_zero_count[x264_scan8[16*1+10] - 1], 0x00 )\
+ MUNGE( nnzbak[2][2], h->mb.cache.non_zero_count[x264_scan8[16*2+ 8] - 1], 0x00 )\
+ MUNGE( nnzbak[2][3], h->mb.cache.non_zero_count[x264_scan8[16*2+10] - 1], 0x00 )\
}\
-if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] )\
+if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] && !(h->mb.cbp[h->mb.i_mb_top_xy] & 0x1000) )\
{\
- MUNGE( M32( &nnzbak[0][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*0] - 8] ), 0x80808080U )\
- MUNGE( M32( &nnzbak[1][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*1] - 8] ), 0x80808080U )\
- MUNGE( M32( &nnzbak[2][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*2] - 8] ), 0x80808080U )\
+ MUNGE( M32( &nnzbak[0][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*0] - 8] ), 0x00000000U )\
+ MUNGE( M32( &nnzbak[1][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*1] - 8] ), 0x00000000U )\
+ MUNGE( M32( &nnzbak[2][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*2] - 8] ), 0x00000000U )\
}
MUNGE_8x8_NNZ( BACKUP )
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 27db1bd..d183460 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -444,11 +444,6 @@ static int x264_validate_parameters( x264_t *h, int b_open )
fail = 1;
}
#endif
- if( !fail && !(cpuflags & X264_CPU_CMOV) )
- {
- x264_log( h, X264_LOG_ERROR, "your cpu does not support CMOV, but x264 was compiled with asm\n");
- fail = 1;
- }
if( fail )
{
x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n");
@@ -494,7 +489,8 @@ static int x264_validate_parameters( x264_t *h, int b_open )
#endif
if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX )
{
- x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/NV21/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" );
+ x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/NV21/I422/YV16/NV16/YUYV/UYVY/"
+ "I444/YV24/BGR/BGRA/RGB supported)\n" );
return -1;
}
@@ -859,6 +855,11 @@ static int x264_validate_parameters( x264_t *h, int b_open )
h->param.analyse.inter &= ~X264_ANALYSE_I8x8;
h->param.analyse.intra &= ~X264_ANALYSE_I8x8;
}
+ if( i_csp >= X264_CSP_I444 && h->param.b_cabac )
+ {
+ /* Disable 8x8dct during 4:4:4+CABAC encoding for compatibility with libavcodec */
+ h->param.analyse.b_transform_8x8 = 0;
+ }
if( h->param.rc.i_rc_method == X264_RC_CQP )
{
float qp_p = h->param.rc.i_qp_constant;
@@ -1170,7 +1171,7 @@ static int x264_validate_parameters( x264_t *h, int b_open )
if( h->param.analyse.i_mv_range <= 0 )
h->param.analyse.i_mv_range = l->mv_range >> PARAM_INTERLACED;
else
- h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 512 >> PARAM_INTERLACED);
+ h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 8192 >> PARAM_INTERLACED);
}
h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART );
@@ -1530,6 +1531,12 @@ x264_t *x264_encoder_open( x264_param_t *param )
x264_rdo_init();
/* init CPU functions */
+#if (ARCH_X86 || ARCH_X86_64) && HIGH_BIT_DEPTH
+ /* FIXME: Only 8-bit has been optimized for AVX-512 so far. The few AVX-512 functions
+ * enabled in high bit-depth are insignificant and just cause potential issues with
+ * unnecessary thermal throttling and whatnot, so keep it disabled for now. */
+ h->param.cpu &= ~X264_CPU_AVX512;
+#endif
x264_predict_16x16_init( h->param.cpu, h->predict_16x16 );
x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c );
x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );
@@ -1566,9 +1573,15 @@ x264_t *x264_encoder_open( x264_param_t *param )
if( !strcmp(x264_cpu_names[i].name, "SSE4.1")
&& (h->param.cpu & X264_CPU_SSE42) )
continue;
+ if( !strcmp(x264_cpu_names[i].name, "LZCNT")
+ && (h->param.cpu & X264_CPU_BMI1) )
+ continue;
if( !strcmp(x264_cpu_names[i].name, "BMI1")
&& (h->param.cpu & X264_CPU_BMI2) )
continue;
+ if( !strcmp(x264_cpu_names[i].name, "FMA4")
+ && (h->param.cpu & X264_CPU_FMA3) )
+ continue;
if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
&& (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
p += sprintf( p, " %s", x264_cpu_names[i].name );
@@ -1580,14 +1593,6 @@ x264_t *x264_encoder_open( x264_param_t *param )
if( x264_analyse_init_costs( h ) )
goto fail;
- static const uint16_t cost_mv_correct[7] = { 24, 47, 95, 189, 379, 757, 1515 };
- /* Checks for known miscompilation issues. */
- if( h->cost_mv[X264_LOOKAHEAD_QP][2013] != cost_mv_correct[BIT_DEPTH-8] )
- {
- x264_log( h, X264_LOG_ERROR, "MV cost test failed: x264 has been miscompiled!\n" );
- goto fail;
- }
-
/* Must be volatile or else GCC will optimize it out. */
volatile int temp = 392;
if( x264_clz( temp ) != 23 )
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 87b076f..929fcc8 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -128,8 +128,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
pixel *p_src = h->mb.pic.p_fenc[p];
pixel *p_dst = h->mb.pic.p_fdec[p];
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] );
- ALIGNED_ARRAY_32( dctcoef, dct_dc4x4,[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct_dc4x4,[16] );
int nz, block_cbp = 0;
int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
@@ -283,13 +283,10 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction )
{
int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
- int ssd[2];
+ ALIGNED_ARRAY_8( int, ssd,[2] );
int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8;
- int score = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
- if( score < thresh*4 )
- score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
- if( score < thresh*4 )
+ if( h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], ssd ) < thresh*4 )
{
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;
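The hunk above switches the chroma-decimation check to the new var2 interface: a single call that returns the combined score and fills a two-entry ssd array, instead of one call per chroma plane with explicit FENC_STRIDE/FDEC_STRIDE arguments. The following is a rough scalar sketch of that contract, not x264's implementation; it assumes (as the single call suggests) that the 8-wide U and V blocks sit side by side in the fenc/fdec macroblock buffers with rows of 16 and 32 pixels respectively, and shows the 8-bit case with pixel as uint8_t.

#include <stdint.h>

/* Sketch of the new var2 contract: h is 8 (PIXEL_8x8, 4:2:0) or 16 (PIXEL_8x16, 4:2:2). */
static int var2_8xh_sketch( const uint8_t *fenc, const uint8_t *fdec, int ssd[2], int h )
{
    int score = 0;
    for( int plane = 0; plane < 2; plane++ )
    {
        int sum = 0, sqr = 0;
        for( int y = 0; y < h; y++ )
            for( int x = 0; x < 8; x++ )
            {
                int d = fenc[16*y + 8*plane + x] - fdec[32*y + 8*plane + x];
                sum += d;
                sqr += d*d;
            }
        ssd[plane] = sqr;                                /* per-plane sum of squared differences */
        score += sqr - ((sum * sum) >> (6 + (h >> 4)));  /* "variance" of the difference */
    }
    return score;                                        /* compared against thresh*4 above */
}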
@@ -350,7 +347,7 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
int i_decimate_score = b_decimate ? 0 : 7;
int nz_ac = 0;
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] );
if( h->mb.b_lossless )
{
@@ -561,9 +558,16 @@ void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_m
pixel *p_src = h->mb.pic.p_fenc_plane[p] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;
if( i_mode == I_PRED_4x4_V )
+ {
h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
+ memcpy( p_dst, p_dst-FDEC_STRIDE, 4*sizeof(pixel) );
+ }
else if( i_mode == I_PRED_4x4_H )
+ {
h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
+ for( int i = 0; i < 4; i++ )
+ p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1];
+ }
else
h->predict_4x4[i_mode]( p_dst );
}
@@ -574,9 +578,16 @@ void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_m
pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride;
if( i_mode == I_PRED_8x8_V )
+ {
h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
+ memcpy( p_dst, &edge[16], 8*sizeof(pixel) );
+ }
else if( i_mode == I_PRED_8x8_H )
+ {
h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
+ for( int i = 0; i < 8; i++ )
+ p_dst[i*FDEC_STRIDE] = edge[14-i];
+ }
else
h->predict_8x8[i_mode]( p_dst, edge );
}
@@ -584,12 +595,21 @@ void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_m
void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode )
{
int stride = h->fenc->i_stride[p] << MB_INTERLACED;
+ pixel *p_dst = h->mb.pic.p_fdec[p];
+
if( i_mode == I_PRED_16x16_V )
- h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 );
+ {
+ h->mc.copy[PIXEL_16x16]( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 );
+ memcpy( p_dst, p_dst-FDEC_STRIDE, 16*sizeof(pixel) );
+ }
else if( i_mode == I_PRED_16x16_H )
- h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 );
+ {
+ h->mc.copy_16x16_unaligned( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 );
+ for( int i = 0; i < 16; i++ )
+ p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1];
+ }
else
- h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
+ h->predict_16x16[i_mode]( p_dst );
}
/*****************************************************************************
@@ -780,7 +800,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
}
else if( h->mb.b_transform_8x8 )
{
- ALIGNED_ARRAY_32( dctcoef, dct8x8,[4],[64] );
+ ALIGNED_ARRAY_64( dctcoef, dct8x8,[4],[64] );
b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
@@ -824,7 +844,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
}
else
{
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] );
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
{
int quant_cat = p ? CQM_4PC : CQM_4PY;
@@ -965,8 +985,8 @@ void x264_macroblock_encode( x264_t *h )
*****************************************************************************/
static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
{
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] );
- ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dctscan,[16] );
ALIGNED_4( int16_t mvp[2] );
int i_qp = h->mb.i_qp;
@@ -1219,7 +1239,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
int quant_cat = p ? CQM_8PC : CQM_8PY;
pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
- ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] );
+ ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] );
h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
@@ -1252,7 +1272,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
int i_decimate_8x8 = b_decimate ? 0 : 4;
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[4],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[4],[16] );
int nnz8x8 = 0;
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
@@ -1311,7 +1331,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
i_qp = h->mb.i_chroma_qp;
for( int ch = 0; ch < 2; ch++ )
{
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[2],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[2],[16] );
pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
@@ -1376,7 +1396,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i
}
else
{
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] );
h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index 9ab4700..1c901a8 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -116,7 +116,7 @@ static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
- ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] );
if( b_predict )
{
@@ -154,7 +154,7 @@ static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
- ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] );
+ ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] );
ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
if( b_predict )
diff --git a/encoder/me.c b/encoder/me.c
index 58a39dc..094fc5d 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -1059,7 +1059,7 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
uint64_t bcostrd = COST_MAX64;
uint16_t amvd;
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
- ALIGNED_ARRAY_32( uint8_t, visited,[8],[8][8] );
+ ALIGNED_ARRAY_64( uint8_t, visited,[8],[8][8] );
/* all permutations of an offset in up to 2 of the dimensions */
ALIGNED_4( static const int8_t dia4d[33][4] ) =
{
diff --git a/encoder/me.h b/encoder/me.h
index 305c42d..505e3ce 100644
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -32,10 +32,10 @@
typedef struct
{
- /* aligning the first member is a gcc hack to force the struct to be
- * 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
+ /* aligning the first member is a gcc hack to force the struct to be aligned,
+ * as well as force sizeof(struct) to be a multiple of the alignment. */
/* input */
- ALIGNED_16( int i_pixel ); /* PIXEL_WxH */
+ ALIGNED_64( int i_pixel ); /* PIXEL_WxH */
uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
int i_ref_cost;
int i_ref;
@@ -53,7 +53,7 @@ typedef struct
int cost_mv; /* lambda * nbits for the chosen mv */
int cost; /* satd + lambda * nbits */
ALIGNED_4( int16_t mv[2] );
-} ALIGNED_16( x264_me_t );
+} ALIGNED_64( x264_me_t );
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
#define x264_me_search( h, m, mvc, i_mvc )\
@@ -66,8 +66,6 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );
-extern uint16_t *x264_cost_mv_fpel[QP_MAX+1][4];
-
#define COPY1_IF_LT(x,y)\
if( (y) < (x) )\
(x) = (y);
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 79e7387..5289316 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -420,7 +420,7 @@ static int x264_macroblock_tree_rescale_init( x264_t *h, x264_ratecontrol_t *rc
float dstdim[2] = { h->param.i_width / 16.f, h->param.i_height / 16.f};
int srcdimi[2] = {ceil(srcdim[0]), ceil(srcdim[1])};
int dstdimi[2] = {ceil(dstdim[0]), ceil(dstdim[1])};
- if( PARAM_INTERLACED )
+ if( h->param.b_interlaced || h->param.b_fake_interlaced )
{
srcdimi[1] = (srcdimi[1]+1)&~1;
dstdimi[1] = (dstdimi[1]+1)&~1;
@@ -1469,7 +1469,7 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )
if( h->i_frame == 0 )
{
//384 * ( Max( PicSizeInMbs, fR * MaxMBPS ) + MaxMBPS * ( tr( 0 ) - tr,n( 0 ) ) ) / MinCR
- double fr = 1. / 172;
+ double fr = 1. / (h->param.i_level_idc >= 60 ? 300 : 172);
int pic_size_in_mbs = h->mb.i_mb_width * h->mb.i_mb_height;
rc->frame_size_maximum = 384 * BIT_DEPTH * X264_MAX( pic_size_in_mbs, fr*l->mbps ) / mincr;
}
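In the second ratecontrol.c hunk above, the first-frame size cap now uses fR = 1/300 for level_idc >= 60 instead of 1/172, in line with the 6.x entries added to the level table in encoder/set.c further down. A rough worked example of the simplified expression the code actually evaluates (it drops the MaxMBPS*(tr(0) - tr,n(0)) term from the quoted spec formula and scales by BIT_DEPTH), with constants copied from the x264_levels entries in this patch and assuming 8-bit 3840x2160 input:

#include <stdio.h>

/* Illustrative arithmetic only; table values from encoder/set.c below. */
int main( void )
{
    int bit_depth = 8;
    int pic_size_in_mbs = (3840/16) * (2160/16);               /* 32400 MBs */
    double fr_old = 1. / 172, fr_new = 1. / 300;

    /* Level 5.1: mbps = 983040, mincr = 2. fr*mbps ~= 5715, so PicSizeInMbs wins the max(). */
    double cap_51 = 384. * bit_depth * pic_size_in_mbs / 2;    /* ~49766400 */
    printf( "level 5.1 frame_size_maximum ~= %.0f\n", cap_51 );

    /* Level 6.0: mbps = 4177920, mincr = 2. Only the fR term changes: */
    printf( "fr*mbps at 6.0: %.0f with 1/300 vs %.0f with 1/172\n",
            fr_new * 4177920, fr_old * 4177920 );
    printf( "(with %d MBs per frame the max() still picks PicSizeInMbs here)\n",
            pic_size_in_mbs );
    return 0;
}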
diff --git a/encoder/rdo.c b/encoder/rdo.c
index bd2eafb..a6865bd 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -64,9 +64,8 @@ static uint16_t cabac_size_5ones[128];
#include "cabac.c"
#define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
- sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) - (CHROMA444 ? 0 : (1024+12)-460) )
-#define COPY_CABAC_PART( pos, size )\
- memcpy( &cb->state[pos], &h->cabac.state[pos], size )
+ sizeof(int) + (CHROMA444 ? 1024+12 : 460) )
+#define COPY_CABAC_PART( pos, size ) memcpy( &cb->state[pos], &h->cabac.state[pos], size )
static ALWAYS_INLINE uint64_t cached_hadamard( x264_t *h, int size, int x, int y )
{
@@ -634,8 +633,8 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
int b_chroma, int dc, int num_coefs, int idx )
{
- ALIGNED_ARRAY_32( dctcoef, orig_coefs, [64] );
- ALIGNED_ARRAY_32( dctcoef, quant_coefs, [64] );
+ ALIGNED_ARRAY_64( dctcoef, orig_coefs, [64] );
+ ALIGNED_ARRAY_64( dctcoef, quant_coefs, [64] );
const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
const int b_interlaced = MB_INTERLACED;
@@ -695,7 +694,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
return !!dct[0];
}
-#if HAVE_MMX && ARCH_X86_64
+#if HAVE_MMX && ARCH_X86_64 && !defined( __MACH__ )
#define TRELLIS_ARGS unquant_mf, zigzag, lambda2, last_nnz, orig_coefs, quant_coefs, dct,\
cabac_state_sig, cabac_state_last, M64(cabac_state), M16(cabac_state+8)
if( num_coefs == 16 && !dc )
diff --git a/encoder/set.c b/encoder/set.c
index f86189f..2ab4e4e 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -783,23 +783,26 @@ int x264_sei_avcintra_vanc_write( x264_t *h, bs_t *s, int len )
const x264_level_t x264_levels[] =
{
- { 10, 1485, 99, 396, 64, 175, 64, 64, 0, 2, 0, 0, 1 },
- { 9, 1485, 99, 396, 128, 350, 64, 64, 0, 2, 0, 0, 1 }, /* "1b" */
- { 11, 3000, 396, 900, 192, 500, 128, 64, 0, 2, 0, 0, 1 },
- { 12, 6000, 396, 2376, 384, 1000, 128, 64, 0, 2, 0, 0, 1 },
- { 13, 11880, 396, 2376, 768, 2000, 128, 64, 0, 2, 0, 0, 1 },
- { 20, 11880, 396, 2376, 2000, 2000, 128, 64, 0, 2, 0, 0, 1 },
- { 21, 19800, 792, 4752, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 },
- { 22, 20250, 1620, 8100, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 },
- { 30, 40500, 1620, 8100, 10000, 10000, 256, 32, 22, 2, 0, 1, 0 },
- { 31, 108000, 3600, 18000, 14000, 14000, 512, 16, 60, 4, 1, 1, 0 },
- { 32, 216000, 5120, 20480, 20000, 20000, 512, 16, 60, 4, 1, 1, 0 },
- { 40, 245760, 8192, 32768, 20000, 25000, 512, 16, 60, 4, 1, 1, 0 },
- { 41, 245760, 8192, 32768, 50000, 62500, 512, 16, 24, 2, 1, 1, 0 },
- { 42, 522240, 8704, 34816, 50000, 62500, 512, 16, 24, 2, 1, 1, 1 },
- { 50, 589824, 22080, 110400, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 },
- { 51, 983040, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
- { 52, 2073600, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
+ { 10, 1485, 99, 396, 64, 175, 64, 64, 0, 2, 0, 0, 1 },
+ { 9, 1485, 99, 396, 128, 350, 64, 64, 0, 2, 0, 0, 1 }, /* "1b" */
+ { 11, 3000, 396, 900, 192, 500, 128, 64, 0, 2, 0, 0, 1 },
+ { 12, 6000, 396, 2376, 384, 1000, 128, 64, 0, 2, 0, 0, 1 },
+ { 13, 11880, 396, 2376, 768, 2000, 128, 64, 0, 2, 0, 0, 1 },
+ { 20, 11880, 396, 2376, 2000, 2000, 128, 64, 0, 2, 0, 0, 1 },
+ { 21, 19800, 792, 4752, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 },
+ { 22, 20250, 1620, 8100, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 },
+ { 30, 40500, 1620, 8100, 10000, 10000, 256, 32, 22, 2, 0, 1, 0 },
+ { 31, 108000, 3600, 18000, 14000, 14000, 512, 16, 60, 4, 1, 1, 0 },
+ { 32, 216000, 5120, 20480, 20000, 20000, 512, 16, 60, 4, 1, 1, 0 },
+ { 40, 245760, 8192, 32768, 20000, 25000, 512, 16, 60, 4, 1, 1, 0 },
+ { 41, 245760, 8192, 32768, 50000, 62500, 512, 16, 24, 2, 1, 1, 0 },
+ { 42, 522240, 8704, 34816, 50000, 62500, 512, 16, 24, 2, 1, 1, 1 },
+ { 50, 589824, 22080, 110400, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 },
+ { 51, 983040, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
+ { 52, 2073600, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
+ { 60, 4177920, 139264, 696320, 240000, 240000, 8192, 16, 24, 2, 1, 1, 1 },
+ { 61, 8355840, 139264, 696320, 480000, 480000, 8192, 16, 24, 2, 1, 1, 1 },
+ { 62, 16711680, 139264, 696320, 800000, 800000, 8192, 16, 24, 2, 1, 1, 1 },
{ 0 }
};
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index b20bbf3..6c0aaa8 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -267,7 +267,7 @@ static NOINLINE unsigned int x264_weight_cost_chroma444( x264_t *h, x264_frame_t
int i_lines = fenc->i_lines[p];
int i_width = fenc->i_width[p];
pixel *src = fenc->plane[p];
- ALIGNED_ARRAY_16( pixel, buf, [16*16] );
+ ALIGNED_ARRAY_64( pixel, buf, [16*16] );
int pixoff = 0;
if( w )
{
@@ -544,17 +544,18 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
if( p0 == p1 )
goto lowres_intra_mb;
+ int mv_range = 2 * h->param.analyse.i_mv_range;
// no need for h->mb.mv_min[]
- h->mb.mv_limit_fpel[0][0] = -8*h->mb.i_mb_x - 4;
- h->mb.mv_limit_fpel[1][0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4;
- h->mb.mv_min_spel[0] = 4*( h->mb.mv_limit_fpel[0][0] - 8 );
- h->mb.mv_max_spel[0] = 4*( h->mb.mv_limit_fpel[1][0] + 8 );
+ h->mb.mv_min_spel[0] = X264_MAX( 4*(-8*h->mb.i_mb_x - 12), -mv_range );
+ h->mb.mv_max_spel[0] = X264_MIN( 4*(8*(h->mb.i_mb_width - h->mb.i_mb_x - 1) + 12), mv_range-1 );
+ h->mb.mv_limit_fpel[0][0] = h->mb.mv_min_spel[0] >> 2;
+ h->mb.mv_limit_fpel[1][0] = h->mb.mv_max_spel[0] >> 2;
if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 )
{
- h->mb.mv_limit_fpel[0][1] = -8*h->mb.i_mb_y - 4;
- h->mb.mv_limit_fpel[1][1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4;
- h->mb.mv_min_spel[1] = 4*( h->mb.mv_limit_fpel[0][1] - 8 );
- h->mb.mv_max_spel[1] = 4*( h->mb.mv_limit_fpel[1][1] + 8 );
+ h->mb.mv_min_spel[1] = X264_MAX( 4*(-8*h->mb.i_mb_y - 12), -mv_range );
+ h->mb.mv_max_spel[1] = X264_MIN( 4*(8*( h->mb.i_mb_height - h->mb.i_mb_y - 1) + 12), mv_range-1 );
+ h->mb.mv_limit_fpel[0][1] = h->mb.mv_min_spel[1] >> 2;
+ h->mb.mv_limit_fpel[1][1] = h->mb.mv_max_spel[1] >> 2;
}
#define LOAD_HPELS_LUMA(dst, src) \
@@ -728,13 +729,13 @@ lowres_intra_mb:
if( h->param.analyse.i_subpel_refine > 1 )
{
h->predict_8x8c[I_PRED_CHROMA_P]( pix );
- int satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
+ int satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE );
i_icost = X264_MIN( i_icost, satd );
h->predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
for( int i = 3; i < 9; i++ )
{
h->predict_8x8[i]( pix, edge );
- satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
+ satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE );
i_icost = X264_MIN( i_icost, satd );
}
}
diff --git a/filters/video/resize.c b/filters/video/resize.c
index 0bacb5b..0d6bd8c 100644
--- a/filters/video/resize.c
+++ b/filters/video/resize.c
@@ -154,10 +154,12 @@ static int convert_csp_to_pix_fmt( int csp )
case X264_CSP_RGB: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_RGB48 : AV_PIX_FMT_RGB24;
case X264_CSP_BGR: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGR48 : AV_PIX_FMT_BGR24;
case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGRA64 : AV_PIX_FMT_BGRA;
- /* the next csp has no equivalent 16bit depth in swscale */
+ /* the following has no equivalent 16-bit depth in swscale */
case X264_CSP_NV12: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV12;
case X264_CSP_NV21: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV21;
- /* the next csp is no supported by swscale at all */
+ case X264_CSP_YUYV: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_YUYV422;
+ case X264_CSP_UYVY: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_UYVY422;
+ /* the following is not supported by swscale at all */
case X264_CSP_NV16:
default: return AV_PIX_FMT_NONE;
}
diff --git a/input/input.c b/input/input.c
index db29a54..335f601 100644
--- a/input/input.c
+++ b/input/input.c
@@ -43,6 +43,8 @@ const x264_cli_csp_t x264_cli_csps[] = {
[X264_CSP_NV12] = { "nv12", 2, { 1, 1 }, { 1, .5 }, 2, 2 },
[X264_CSP_NV21] = { "nv21", 2, { 1, 1 }, { 1, .5 }, 2, 2 },
[X264_CSP_NV16] = { "nv16", 2, { 1, 1 }, { 1, 1 }, 2, 1 },
+ [X264_CSP_YUYV] = { "yuyv", 1, { 2 }, { 1 }, 2, 1 },
+ [X264_CSP_UYVY] = { "uyvy", 1, { 2 }, { 1 }, 2, 1 },
[X264_CSP_BGR] = { "bgr", 1, { 3 }, { 1 }, 1, 1 },
[X264_CSP_BGRA] = { "bgra", 1, { 4 }, { 1 }, 1, 1 },
[X264_CSP_RGB] = { "rgb", 1, { 3 }, { 1 }, 1, 1 },
diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm
index d1da9b7..a9f7493 100644
--- a/tools/checkasm-a.asm
+++ b/tools/checkasm-a.asm
@@ -225,3 +225,14 @@ cglobal stack_pagealign, 2,2
leave
RET
+; Trigger a warmup of vector units
+%macro WARMUP 0
+cglobal checkasm_warmup, 0,0
+ xorps m0, m0
+ RET
+%endmacro
+
+INIT_YMM avx
+WARMUP
+INIT_ZMM avx512
+WARMUP
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 4657bba..e25a45c 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -57,8 +57,7 @@ int quiet = 0;
if( !ok ) ret = -1; \
}
-#define BENCH_RUNS 100 // tradeoff between accuracy and speed
-#define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff)
+#define BENCH_RUNS 2000 // tradeoff between accuracy and speed
#define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions
#define MAX_CPUS 30 // number of different combinations of cpu flags
@@ -178,7 +177,10 @@ static void print_bench(void)
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
#if HAVE_MMX
+ b->cpu&X264_CPU_AVX512 ? "avx512" :
b->cpu&X264_CPU_AVX2 ? "avx2" :
+ b->cpu&X264_CPU_BMI2 ? "bmi2" :
+ b->cpu&X264_CPU_BMI1 ? "bmi1" :
b->cpu&X264_CPU_FMA3 ? "fma3" :
b->cpu&X264_CPU_FMA4 ? "fma4" :
b->cpu&X264_CPU_XOP ? "xop" :
@@ -187,6 +189,7 @@ static void print_bench(void)
b->cpu&X264_CPU_SSE4 ? "sse4" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
+ b->cpu&X264_CPU_LZCNT ? "lzcnt" :
/* print sse2slow only if there's also a sse2fast version of the same func */
b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
@@ -209,10 +212,7 @@ static void print_bench(void)
b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" :
- b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
- b->cpu&X264_CPU_BMI2 ? "_bmi2" :
- b->cpu&X264_CPU_BMI1 ? "_bmi1" :
- b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
+ b->cpu&X264_CPU_LZCNT && b->cpu&X264_CPU_SSE3 && !(b->cpu&X264_CPU_BMI1) ? "_lzcnt" :
b->cpu&X264_CPU_SLOW_ATOM ? "_atom" :
#elif ARCH_ARM
b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
@@ -222,8 +222,18 @@ static void print_bench(void)
}
}
+/* YMM and ZMM registers on x86 are turned off to save power when they haven't been
+ * used for some period of time. When they are used there will be a "warmup" period
+ * during which performance will be reduced and inconsistent, which is problematic when
+ * trying to benchmark individual functions. We can work around this by periodically
+ * issuing "dummy" instructions that use those registers to keep them powered on. */
+static void (*simd_warmup_func)( void ) = NULL;
+#define simd_warmup() do { if( simd_warmup_func ) simd_warmup_func(); } while( 0 )
+
#if ARCH_X86 || ARCH_X86_64
int x264_stack_pagealign( int (*func)(), int align );
+void x264_checkasm_warmup_avx( void );
+void x264_checkasm_warmup_avx512( void );
/* detect when callee-saved regs aren't saved
* needs an explicit asm check because it only sometimes crashes in normal use. */
@@ -258,6 +268,7 @@ void x264_checkasm_stack_clobber( uint64_t clobber, ... );
#define call_a1(func,...) ({ \
uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \
+ simd_warmup(); \
x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); })
#elif ARCH_AARCH64 && !defined(__APPLE__)
void x264_checkasm_stack_clobber( uint64_t clobber, ... );
@@ -285,6 +296,7 @@ void x264_checkasm_stack_clobber( uint64_t clobber, ... );
call_a1(func, __VA_ARGS__);\
for( int ti = 0; ti < (cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\
{\
+ simd_warmup();\
uint32_t t = read_time();\
func(__VA_ARGS__);\
func(__VA_ARGS__);\
@@ -358,8 +370,9 @@ static int check_pixel( int cpu_ref, int cpu_new )
used_asm = 1; \
for( int j = 0; j < 64; j++ ) \
{ \
- res_c = call_c( pixel_c.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
- res_asm = call_a( pixel_asm.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
+ intptr_t stride1 = (j&31) == 31 ? 32 : FENC_STRIDE; \
+ res_c = call_c( pixel_c.name[i], pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \
+ res_asm = call_a( pixel_asm.name[i], pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \
if( res_c != res_asm ) \
{ \
ok = 0; \
@@ -494,15 +507,17 @@ static int check_pixel( int cpu_ref, int cpu_new )
#define TEST_PIXEL_VAR2( i ) \
if( pixel_asm.var2[i] != pixel_ref.var2[i] ) \
{ \
- int res_c, res_asm, ssd_c, ssd_asm; \
+ int res_c, res_asm; \
+ ALIGNED_ARRAY_8( int, ssd_c, [2] ); \
+ ALIGNED_ARRAY_8( int, ssd_asm,[2] ); \
set_func_name( "%s_%s", "var2", pixel_names[i] ); \
used_asm = 1; \
- res_c = call_c( pixel_c.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_c ); \
- res_asm = call_a( pixel_asm.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_asm ); \
- if( res_c != res_asm || ssd_c != ssd_asm ) \
+ res_c = call_c( pixel_c.var2[i], pbuf1, pbuf2, ssd_c ); \
+ res_asm = call_a( pixel_asm.var2[i], pbuf1, pbuf2, ssd_asm ); \
+ if( res_c != res_asm || memcmp( ssd_c, ssd_asm, 2*sizeof(int) ) ) \
{ \
ok = 0; \
- fprintf( stderr, "var2[%d]: %d != %d or %d != %d [FAILED]\n", i, res_c, res_asm, ssd_c, ssd_asm ); \
+ fprintf( stderr, "var2[%d]: {%d, %d, %d} != {%d, %d, %d} [FAILED]\n", i, res_c, ssd_c[0], ssd_c[1], res_asm, ssd_asm[0], ssd_asm[1] ); \
} \
}
@@ -827,10 +842,10 @@ static int check_dct( int cpu_ref, int cpu_new )
x264_dct_function_t dct_asm;
x264_quant_function_t qf;
int ret = 0, ok, used_asm, interlace = 0;
- ALIGNED_ARRAY_32( dctcoef, dct1, [16],[16] );
- ALIGNED_ARRAY_32( dctcoef, dct2, [16],[16] );
- ALIGNED_ARRAY_32( dctcoef, dct4, [16],[16] );
- ALIGNED_ARRAY_32( dctcoef, dct8, [4],[64] );
+ ALIGNED_ARRAY_64( dctcoef, dct1, [16],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct2, [16],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct4, [16],[16] );
+ ALIGNED_ARRAY_64( dctcoef, dct8, [4],[64] );
ALIGNED_16( dctcoef dctdc[2][8] );
x264_t h_buf;
x264_t *h = &h_buf;
@@ -1032,8 +1047,8 @@ static int check_dct( int cpu_ref, int cpu_new )
x264_zigzag_function_t zigzag_ref[2];
x264_zigzag_function_t zigzag_asm[2];
- ALIGNED_ARRAY_16( dctcoef, level1,[64] );
- ALIGNED_ARRAY_16( dctcoef, level2,[64] );
+ ALIGNED_ARRAY_64( dctcoef, level1,[64] );
+ ALIGNED_ARRAY_64( dctcoef, level2,[64] );
#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
@@ -1526,6 +1541,33 @@ static int check_mc( int cpu_ref, int cpu_new )
}
}
+ if( mc_a.plane_copy_deinterleave_yuyv != mc_ref.plane_copy_deinterleave_yuyv )
+ {
+ set_func_name( "plane_copy_deinterleave_yuyv" );
+ used_asm = 1;
+ for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
+ {
+ int w = (plane_specs[i].w + 1) >> 1;
+ int h = plane_specs[i].h;
+ intptr_t dst_stride = ALIGN( w, 32/sizeof(pixel) );
+ intptr_t src_stride = (plane_specs[i].src_stride + 1) >> 1;
+ intptr_t offv = dst_stride*h;
+ pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
+ memset( pbuf3, 0, 0x1000 );
+ memset( pbuf4, 0, 0x1000 );
+ /* Skip benchmarking since it's the same as plane_copy_deinterleave(), just verify correctness. */
+ call_c1( mc_c.plane_copy_deinterleave_yuyv, pbuf3, dst_stride, pbuf3+offv, dst_stride, src1, src_stride, w, h );
+ call_a1( mc_a.plane_copy_deinterleave_yuyv, pbuf4, dst_stride, pbuf4+offv, dst_stride, src1, src_stride, w, h );
+ for( int y = 0; y < h; y++ )
+ if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(pixel) ) ||
+ memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w*sizeof(pixel) ) )
+ {
+ fprintf( stderr, "plane_copy_deinterleave_yuyv FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
+ break;
+ }
+ }
+ }
+
if( mc_a.plane_copy_deinterleave_rgb != mc_ref.plane_copy_deinterleave_rgb )
{
set_func_name( "plane_copy_deinterleave_rgb" );
@@ -1566,7 +1608,7 @@ static int check_mc( int cpu_ref, int cpu_new )
{
int w = (plane_specs[i].w + 1) >> 1;
int h = plane_specs[i].h;
- intptr_t dst_stride = ALIGN( w, 16 );
+ intptr_t dst_stride = ALIGN( w, 32 );
intptr_t src_stride = (w + 47) / 48 * 128 / sizeof(uint32_t);
intptr_t offv = dst_stride*h + 32;
memset( pbuf3, 0, 0x1000 );
@@ -1704,7 +1746,7 @@ static int check_mc( int cpu_ref, int cpu_new )
{
ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
if( !ok )
- fprintf( stderr, "mbtree_propagate_cost FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
+ fprintf( stderr, "mbtree_propagate_cost FAILED: %d !~= %d\n", dstc[j], dsta[j] );
}
}
}
@@ -1723,15 +1765,16 @@ static int check_mc( int cpu_ref, int cpu_new )
h.mb.i_mb_width = width;
h.mb.i_mb_height = height;
- uint16_t *ref_costsc = (uint16_t*)buf3;
- uint16_t *ref_costsa = (uint16_t*)buf4;
- int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + size);
+ uint16_t *ref_costsc = (uint16_t*)buf3 + width;
+ uint16_t *ref_costsa = (uint16_t*)buf4 + width;
+ int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + width + size);
int16_t *propagate_amount = (int16_t*)(mvs + width);
uint16_t *lowres_costs = (uint16_t*)(propagate_amount + width);
- h.scratch_buffer2 = (uint8_t*)(ref_costsa + size);
+ h.scratch_buffer2 = (uint8_t*)(ref_costsa + width + size);
int bipred_weight = (rand()%63)+1;
+ int mb_y = rand()&3;
int list = i&1;
- for( int j = 0; j < size; j++ )
+ for( int j = -width; j < size+width; j++ )
ref_costsc[j] = ref_costsa[j] = rand()&32767;
for( int j = 0; j < width; j++ )
{
@@ -1742,18 +1785,18 @@ static int check_mc( int cpu_ref, int cpu_new )
lowres_costs[j] = list_dist[list][rand()&7] << LOWRES_COST_SHIFT;
}
- call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
- call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+ call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
+ call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
- for( int j = 0; j < size && ok; j++ )
+ for( int j = -width; j < size+width && ok; j++ )
{
ok &= abs(ref_costsa[j] - ref_costsc[j]) <= 1;
if( !ok )
fprintf( stderr, "mbtree_propagate_list FAILED at %d: %d !~= %d\n", j, ref_costsc[j], ref_costsa[j] );
}
- call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
- call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+ call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
+ call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
}
}
@@ -1816,12 +1859,14 @@ static int check_mc( int cpu_ref, int cpu_new )
{
set_func_name( "memcpy_aligned" );
ok = 1; used_asm = 1;
- for( size_t size = 16; size < 256; size += 16 )
+ for( size_t size = 16; size < 512; size += 16 )
{
- memset( buf4, 0xAA, size + 1 );
+ for( int i = 0; i < size; i++ )
+ buf1[i] = rand();
+ memset( buf4-1, 0xAA, size + 2 );
call_c( mc_c.memcpy_aligned, buf3, buf1, size );
call_a( mc_a.memcpy_aligned, buf4, buf1, size );
- if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
+ if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA )
{
ok = 0;
fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", (int)size );
@@ -1837,10 +1882,10 @@ static int check_mc( int cpu_ref, int cpu_new )
ok = 1; used_asm = 1;
for( size_t size = 128; size < 1024; size += 128 )
{
- memset( buf4, 0xAA, size + 1 );
+ memset( buf4-1, 0xAA, size + 2 );
call_c( mc_c.memzero_aligned, buf3, size );
call_a( mc_a.memzero_aligned, buf4, size );
- if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
+ if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA )
{
ok = 0;
fprintf( stderr, "memzero_aligned FAILED: size=%d\n", (int)size );
@@ -1920,9 +1965,12 @@ static int check_deblock( int cpu_ref, int cpu_new )
if( db_a.deblock_strength != db_ref.deblock_strength )
{
+ set_func_name( "deblock_strength" );
+ used_asm = 1;
for( int i = 0; i < 100; i++ )
{
- ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] );
+ ALIGNED_ARRAY_16( uint8_t, nnz_buf, [X264_SCAN8_SIZE+8] );
+ uint8_t *nnz = &nnz_buf[8];
ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] );
ALIGNED_ARRAY_32( uint8_t, bs, [2],[2][8][4] );
@@ -1934,9 +1982,8 @@ static int check_deblock( int cpu_ref, int cpu_new )
{
ref[j][k] = ((rand()&3) != 3) ? 0 : (rand() & 31) - 2;
for( int l = 0; l < 2; l++ )
- mv[j][k][l] = ((rand()&7) != 7) ? (rand()&7) - 3 : (rand()&1023) - 512;
+ mv[j][k][l] = ((rand()&7) != 7) ? (rand()&7) - 3 : (rand()&16383) - 8192;
}
- set_func_name( "deblock_strength" );
call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) );
call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) );
if( memcmp( bs[0], bs[1], sizeof(uint8_t)*2*4*8 ) )
@@ -1969,8 +2016,8 @@ static int check_quant( int cpu_ref, int cpu_new )
x264_quant_function_t qf_c;
x264_quant_function_t qf_ref;
x264_quant_function_t qf_a;
- ALIGNED_ARRAY_32( dctcoef, dct1,[64] );
- ALIGNED_ARRAY_32( dctcoef, dct2,[64] );
+ ALIGNED_ARRAY_64( dctcoef, dct1,[64] );
+ ALIGNED_ARRAY_64( dctcoef, dct2,[64] );
ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] );
ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] );
ALIGNED_ARRAY_32( uint8_t, cqm_buf,[64] );
@@ -2214,7 +2261,7 @@ static int check_quant( int cpu_ref, int cpu_new )
int max = X264_MIN( i, PIXEL_MAX*16 ); \
for( int j = 0; j < size; j++ ) \
dct1[j] = rand()%(max*2+1) - max; \
- for( int j = 0; i <= size; j += 4 ) \
+ for( int j = 0; j <= size; j += 4 ) \
qf_c.quant_2x2_dc( &dct1[j], h->quant4_mf[CQM_4IC][qpdc][0]>>1, h->quant4_bias[CQM_4IC][qpdc][0]>>1 ); \
memcpy( dct2, dct1, size*sizeof(dctcoef) ); \
res_c = call_c1( qf_c.optname, dct1, dmf ); \
@@ -2575,6 +2622,11 @@ static int check_cabac( int cpu_ref, int cpu_new )
x264_quant_init( &h, cpu_new, &h.quantf );
h.quantf.coeff_last[DCT_CHROMA_DC] = h.quantf.coeff_last4;
+/* Reset cabac state to avoid buffer overruns in do_bench() with large BENCH_RUNS values. */
+#define GET_CB( i ) (\
+ x264_cabac_encode_init( &cb[i], bitstream[i], bitstream[i]+0xfff0 ),\
+ cb[i].f8_bits_encoded = 0, &cb[i] )
+
#define CABAC_RESIDUAL(name, start, end, rd)\
{\
if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\
@@ -2587,7 +2639,7 @@ static int check_cabac( int cpu_ref, int cpu_new )
{\
for( int j = 0; j < 256; j++ )\
{\
- ALIGNED_ARRAY_32( dctcoef, dct, [2],[64] );\
+ ALIGNED_ARRAY_64( dctcoef, dct, [2],[64] );\
uint8_t bitstream[2][1<<16];\
static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\
int ac = ctx_ac[ctx_block_cat];\
@@ -2610,13 +2662,9 @@ static int check_cabac( int cpu_ref, int cpu_new )
x264_cabac_t cb[2];\
x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\
x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\
- x264_cabac_encode_init( &cb[0], bitstream[0], bitstream[0]+0xfff0 );\
- x264_cabac_encode_init( &cb[1], bitstream[1], bitstream[1]+0xfff0 );\
- cb[0].f8_bits_encoded = 0;\
- cb[1].f8_bits_encoded = 0;\
if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\
- call_c1( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
- call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
+ call_c1( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
+ call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\
if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\
if( !ok )\
@@ -2629,8 +2677,8 @@ static int check_cabac( int cpu_ref, int cpu_new )
}\
if( (j&15) == 0 )\
{\
- call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
- call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
+ call_c2( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
+ call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
}\
}\
}\
@@ -2757,6 +2805,14 @@ static int check_all_flags( void )
int ret = 0;
int cpu0 = 0, cpu1 = 0;
uint32_t cpu_detect = x264_cpu_detect();
+#if ARCH_X86 || ARCH_X86_64
+ if( cpu_detect & X264_CPU_AVX512 )
+ simd_warmup_func = x264_checkasm_warmup_avx512;
+ else if( cpu_detect & X264_CPU_AVX )
+ simd_warmup_func = x264_checkasm_warmup_avx;
+#endif
+ simd_warmup();
+
#if HAVE_MMX
if( cpu_detect & X264_CPU_MMX2 )
{
@@ -2767,13 +2823,6 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
cpu1 &= ~X264_CPU_CACHELINE_32;
#endif
- if( cpu_detect & X264_CPU_LZCNT )
- {
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX LZCNT" );
- cpu1 &= ~X264_CPU_LZCNT;
- }
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
- cpu1 &= ~X264_CPU_SLOW_CTZ;
}
if( cpu_detect & X264_CPU_SSE )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" );
@@ -2785,13 +2834,11 @@ static int check_all_flags( void )
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" );
cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
- cpu1 &= ~X264_CPU_SLOW_CTZ;
- if( cpu_detect & X264_CPU_LZCNT )
- {
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE2 LZCNT" );
- cpu1 &= ~X264_CPU_LZCNT;
- }
+ }
+ if( cpu_detect & X264_CPU_LZCNT )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "LZCNT" );
+ cpu1 &= ~X264_CPU_LZCNT;
}
if( cpu_detect & X264_CPU_SSE3 )
{
@@ -2805,8 +2852,6 @@ static int check_all_flags( void )
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" );
cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
- cpu1 &= ~X264_CPU_SLOW_CTZ;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" );
cpu1 &= ~X264_CPU_CACHELINE_64;
@@ -2831,29 +2876,15 @@ static int check_all_flags( void )
cpu1 &= ~X264_CPU_FMA4;
}
if( cpu_detect & X264_CPU_FMA3 )
- {
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
- cpu1 &= ~X264_CPU_FMA3;
- }
- if( cpu_detect & X264_CPU_AVX2 )
- {
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3 | X264_CPU_AVX2, "AVX2" );
- if( cpu_detect & X264_CPU_LZCNT )
- {
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2 LZCNT" );
- cpu1 &= ~X264_CPU_LZCNT;
- }
- }
if( cpu_detect & X264_CPU_BMI1 )
- {
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
- cpu1 &= ~X264_CPU_BMI1;
- }
if( cpu_detect & X264_CPU_BMI2 )
- {
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" );
- cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2);
- }
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" );
+ if( cpu_detect & X264_CPU_AVX2 )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
+ if( cpu_detect & X264_CPU_AVX512 )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX512, "AVX512" );
#elif ARCH_PPC
if( cpu_detect & X264_CPU_ALTIVEC )
{
@@ -2883,8 +2914,6 @@ static int check_all_flags( void )
int main(int argc, char *argv[])
{
- int ret = 0;
-
#ifdef _WIN32
/* Disable the Windows Error Reporting dialog */
SetErrorMode( SEM_NOGPFAULTERRORBOX );
@@ -2910,8 +2939,8 @@ int main(int argc, char *argv[])
fprintf( stderr, "x264: using random seed %u\n", seed );
srand( seed );
- buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 32*BENCH_ALIGNS );
- pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 32*BENCH_ALIGNS );
+ buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) );
+ pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) );
if( !buf1 || !pbuf1 )
{
fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
@@ -2932,21 +2961,7 @@ int main(int argc, char *argv[])
}
memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) );
- /* 32-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */
- if( do_bench )
- for( int i = 0; i < BENCH_ALIGNS && !ret; i++ )
- {
- INIT_POINTER_OFFSETS;
- ret |= x264_stack_pagealign( check_all_flags, i*32 );
- buf1 += 32;
- pbuf1 += 32;
- quiet = 1;
- fprintf( stderr, "%d/%d\r", i+1, BENCH_ALIGNS );
- }
- else
- ret = x264_stack_pagealign( check_all_flags, 0 );
-
- if( ret )
+ if( x264_stack_pagealign( check_all_flags, 0 ) )
{
fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" );
return -1;
diff --git a/version.sh b/version.sh
index 60e8e07..30d4cd2 100755
--- a/version.sh
+++ b/version.sh
@@ -1,5 +1,5 @@
#!/bin/sh
# Script modified from upstream source for Debian packaging since packaging
# won't include .git repository.
-echo '#define X264_VERSION " r2795 aaa9aa8"'
-echo '#define X264_POINTVER "0.148.2795 aaa9aa8"'
+echo '#define X264_VERSION " r2854 e9a5903"'
+echo '#define X264_POINTVER "0.152.2854 e9a5903"'
diff --git a/x264.h b/x264.h
index 18f5796..0f34067 100644
--- a/x264.h
+++ b/x264.h
@@ -45,7 +45,7 @@ extern "C" {
#include "x264_config.h"
-#define X264_BUILD 148
+#define X264_BUILD 152
/* Application developers planning to link against a shared library version of
* libx264 from a Microsoft Visual Studio or similar development environment
@@ -119,39 +119,38 @@ typedef struct x264_nal_t
/* CPU flags */
/* x86 */
-#define X264_CPU_CMOV 0x0000001
-#define X264_CPU_MMX 0x0000002
-#define X264_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */
-#define X264_CPU_MMXEXT X264_CPU_MMX2
-#define X264_CPU_SSE 0x0000008
-#define X264_CPU_SSE2 0x0000010
-#define X264_CPU_SSE3 0x0000020
-#define X264_CPU_SSSE3 0x0000040
-#define X264_CPU_SSE4 0x0000080 /* SSE4.1 */
-#define X264_CPU_SSE42 0x0000100 /* SSE4.2 */
-#define X264_CPU_LZCNT 0x0000200 /* Phenom support for "leading zero count" instruction. */
-#define X264_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. */
-#define X264_CPU_XOP 0x0000800 /* AMD XOP */
-#define X264_CPU_FMA4 0x0001000 /* AMD FMA4 */
-#define X264_CPU_FMA3 0x0002000 /* FMA3 */
-#define X264_CPU_AVX2 0x0004000 /* AVX2 */
-#define X264_CPU_BMI1 0x0008000 /* BMI1 */
-#define X264_CPU_BMI2 0x0010000 /* BMI2 */
+#define X264_CPU_MMX (1<<0)
+#define X264_CPU_MMX2 (1<<1) /* MMX2 aka MMXEXT aka ISSE */
+#define X264_CPU_MMXEXT X264_CPU_MMX2
+#define X264_CPU_SSE (1<<2)
+#define X264_CPU_SSE2 (1<<3)
+#define X264_CPU_LZCNT (1<<4)
+#define X264_CPU_SSE3 (1<<5)
+#define X264_CPU_SSSE3 (1<<6)
+#define X264_CPU_SSE4 (1<<7) /* SSE4.1 */
+#define X264_CPU_SSE42 (1<<8) /* SSE4.2 */
+#define X264_CPU_AVX (1<<9) /* Requires OS support even if YMM registers aren't used */
+#define X264_CPU_XOP (1<<10) /* AMD XOP */
+#define X264_CPU_FMA4 (1<<11) /* AMD FMA4 */
+#define X264_CPU_FMA3 (1<<12)
+#define X264_CPU_BMI1 (1<<13)
+#define X264_CPU_BMI2 (1<<14)
+#define X264_CPU_AVX2 (1<<15)
+#define X264_CPU_AVX512 (1<<16) /* AVX-512 {F, CD, BW, DQ, VL}, requires OS support */
/* x86 modifiers */
-#define X264_CPU_CACHELINE_32 0x0020000 /* avoid memory loads that span the border between two cachelines */
-#define X264_CPU_CACHELINE_64 0x0040000 /* 32/64 is the size of a cacheline in bytes */
-#define X264_CPU_SSE2_IS_SLOW 0x0080000 /* avoid most SSE2 functions on Athlon64 */
-#define X264_CPU_SSE2_IS_FAST 0x0100000 /* a few functions are only faster on Core2 and Phenom */
-#define X264_CPU_SLOW_SHUFFLE 0x0200000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
-#define X264_CPU_STACK_MOD4 0x0400000 /* if stack is only mod4 and not mod16 */
-#define X264_CPU_SLOW_CTZ 0x0800000 /* BSR/BSF x86 instructions are really slow on some CPUs */
-#define X264_CPU_SLOW_ATOM 0x1000000 /* The Atom is terrible: slow SSE unaligned loads, slow
+#define X264_CPU_CACHELINE_32 (1<<17) /* avoid memory loads that span the border between two cachelines */
+#define X264_CPU_CACHELINE_64 (1<<18) /* 32/64 is the size of a cacheline in bytes */
+#define X264_CPU_SSE2_IS_SLOW (1<<19) /* avoid most SSE2 functions on Athlon64 */
+#define X264_CPU_SSE2_IS_FAST (1<<20) /* a few functions are only faster on Core2 and Phenom */
+#define X264_CPU_SLOW_SHUFFLE (1<<21) /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
+#define X264_CPU_STACK_MOD4 (1<<22) /* if stack is only mod4 and not mod16 */
+#define X264_CPU_SLOW_ATOM (1<<23) /* The Atom is terrible: slow SSE unaligned loads, slow
* SIMD multiplies, slow SIMD variable shifts, slow pshufb,
* cacheline split penalties -- gather everything here that
* isn't shared by other CPUs to avoid making half a dozen
* new SLOW flags. */
-#define X264_CPU_SLOW_PSHUFB 0x2000000 /* such as on the Intel Atom */
-#define X264_CPU_SLOW_PALIGNR 0x4000000 /* such as on the AMD Bobcat */
+#define X264_CPU_SLOW_PSHUFB (1<<24) /* such as on the Intel Atom */
+#define X264_CPU_SLOW_PALIGNR (1<<25) /* such as on the AMD Bobcat */
/* PowerPC */
#define X264_CPU_ALTIVEC 0x0000001
@@ -227,13 +226,15 @@ static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 };
#define X264_CSP_I422 0x0005 /* yuv 4:2:2 planar */
#define X264_CSP_YV16 0x0006 /* yvu 4:2:2 planar */
#define X264_CSP_NV16 0x0007 /* yuv 4:2:2, with one y plane and one packed u+v */
-#define X264_CSP_V210 0x0008 /* 10-bit yuv 4:2:2 packed in 32 */
-#define X264_CSP_I444 0x0009 /* yuv 4:4:4 planar */
-#define X264_CSP_YV24 0x000a /* yvu 4:4:4 planar */
-#define X264_CSP_BGR 0x000b /* packed bgr 24bits */
-#define X264_CSP_BGRA 0x000c /* packed bgr 32bits */
-#define X264_CSP_RGB 0x000d /* packed rgb 24bits */
-#define X264_CSP_MAX 0x000e /* end of list */
+#define X264_CSP_YUYV 0x0008 /* yuyv 4:2:2 packed */
+#define X264_CSP_UYVY 0x0009 /* uyvy 4:2:2 packed */
+#define X264_CSP_V210 0x000a /* 10-bit yuv 4:2:2 packed in 32 */
+#define X264_CSP_I444 0x000b /* yuv 4:4:4 planar */
+#define X264_CSP_YV24 0x000c /* yvu 4:4:4 planar */
+#define X264_CSP_BGR 0x000d /* packed bgr 24bits */
+#define X264_CSP_BGRA 0x000e /* packed bgr 32bits */
+#define X264_CSP_RGB 0x000f /* packed rgb 24bits */
+#define X264_CSP_MAX 0x0010 /* end of list */
#define X264_CSP_VFLIP 0x1000 /* the csp is vertically flipped */
#define X264_CSP_HIGH_DEPTH 0x2000 /* the csp has a depth of 16 bits per pixel component */
@@ -563,19 +564,19 @@ void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal );
typedef struct x264_level_t
{
- int level_idc;
- int mbps; /* max macroblock processing rate (macroblocks/sec) */
- int frame_size; /* max frame size (macroblocks) */
- int dpb; /* max decoded picture buffer (mbs) */
- int bitrate; /* max bitrate (kbit/sec) */
- int cpb; /* max vbv buffer (kbit) */
- int mv_range; /* max vertical mv component range (pixels) */
- int mvs_per_2mb; /* max mvs per 2 consecutive mbs. */
- int slice_rate; /* ?? */
- int mincr; /* min compression ratio */
- int bipred8x8; /* limit bipred to >=8x8 */
- int direct8x8; /* limit b_direct to >=8x8 */
- int frame_only; /* forbid interlacing */
+ uint8_t level_idc;
+ uint32_t mbps; /* max macroblock processing rate (macroblocks/sec) */
+ uint32_t frame_size; /* max frame size (macroblocks) */
+ uint32_t dpb; /* max decoded picture buffer (mbs) */
+ uint32_t bitrate; /* max bitrate (kbit/sec) */
+ uint32_t cpb; /* max vbv buffer (kbit) */
+ uint16_t mv_range; /* max vertical mv component range (pixels) */
+ uint8_t mvs_per_2mb; /* max mvs per 2 consecutive mbs. */
+ uint8_t slice_rate; /* ?? */
+ uint8_t mincr; /* min compression ratio */
+ uint8_t bipred8x8; /* limit bipred to >=8x8 */
+ uint8_t direct8x8; /* limit b_direct to >=8x8 */
+ uint8_t frame_only; /* forbid interlacing */
} x264_level_t;
/* all of the levels defined in the standard, terminated by .level_idc=0 */
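Although the entries above were re-typed to fixed-width fields, the table is still consumed the same way: per the comment, it is terminated by a .level_idc of 0. A minimal consumer loop, assuming the extern x264_levels[] declaration that accompanies this comment in x264.h:

#include <stdio.h>
#include "x264.h"

/* Walk the level table until the 0 terminator; field names as in x264_level_t above. */
static void print_levels( void )
{
    for( const x264_level_t *l = x264_levels; l->level_idc; l++ )
        printf( "level_idc %3d: %u MB/s, %u MBs/frame, %u kbit/s max bitrate\n",
                l->level_idc, l->mbps, l->frame_size, l->bitrate );
}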
--
x264 packaging