[SCM] libav/experimental: and another +2% speedup

siretart at users.alioth.debian.org siretart at users.alioth.debian.org
Sun Jun 30 15:30:15 UTC 2013


The following commit has been merged in the experimental branch:
commit ac0b0b2f6d2a7a528a04353e1d8e1f42166525c2
Author: Michael Niedermayer <michaelni at gmx.at>
Date:   Tue Oct 23 12:05:34 2001 +0000

    and another +2% speedup
    
    Originally committed as revision 2417 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc

diff --git a/postproc/postprocess.c b/postproc/postprocess.c
index b32d3fe..5ed2477 100644
--- a/postproc/postprocess.c
+++ b/postproc/postprocess.c
@@ -336,7 +336,7 @@ static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
 		: "=r" (isOk)
 		: "r" (src), "r" (stride)
 		);
-	return isOk ? 1 : 0;
+	return isOk;
 #else
 
 	int isOk2= 1;
@@ -1303,40 +1303,60 @@ static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
 #ifdef HAVE_MMX
 asm volatile (
 //		"int $3 \n\t"
-		"pushl %1\n\t"
+		"leal (%1, %2), %%ecx				\n\t"
+		"leal (%%ecx, %2, 4), %%ebx			\n\t"
+//	0	1	2	3	4	5	6	7	8	9
+//	%1	ecx	ecx+%2	ecx+2%2	%1+4%2	ebx	ebx+%2	ebx+2%2	%1+8%2	ebx+4%2
 		"movq b7E, %%mm7				\n\t" // mm7 = 0x7F
 		"movq b7C, %%mm6				\n\t" // mm6 = 0x7D
-		"leal tempBlock, %%eax				\n\t"
 		"pxor %%mm0, %%mm0				\n\t"
+		"movl %1, %%eax					\n\t"
+		"andl $0x1F, %%eax				\n\t"
+		"cmpl $24, %%eax				\n\t"
+		"leal tempBlock, %%eax				\n\t"
+		"jb 1f						\n\t"
 
-#define HDC_CHECK_AND_CPY(i) \
-		"movq -4(%1), %%mm2				\n\t"\
-		"psrlq $32, %%mm2				\n\t"\
-		"punpckldq 4(%1), %%mm2				\n\t" /* (%1) */\
+#define HDC_CHECK_AND_CPY(src, dst) \
+		"movd " #src ", %%mm2				\n\t"\
+		"punpckldq 4" #src ", %%mm2				\n\t" /* (%1) */\
 		"movq %%mm2, %%mm1				\n\t"\
 		"psrlq $8, %%mm2				\n\t"\
 		"psubb %%mm1, %%mm2				\n\t"\
 		"paddb %%mm7, %%mm2				\n\t"\
 		"pcmpgtb %%mm6, %%mm2				\n\t"\
 		"paddb %%mm2, %%mm0				\n\t"\
-		"movq %%mm1," #i "(%%eax)			\n\t"
-
-		HDC_CHECK_AND_CPY(0)
-		"addl %2, %1					\n\t"
-		HDC_CHECK_AND_CPY(8)
-		"addl %2, %1					\n\t"
-		HDC_CHECK_AND_CPY(16)
-		"addl %2, %1					\n\t"
-		HDC_CHECK_AND_CPY(24)
-		"addl %2, %1					\n\t"
-		HDC_CHECK_AND_CPY(32)
-		"addl %2, %1					\n\t"
-		HDC_CHECK_AND_CPY(40)
-		"addl %2, %1					\n\t"
-		HDC_CHECK_AND_CPY(48)
-		"addl %2, %1					\n\t"
-		HDC_CHECK_AND_CPY(56)
-
+		"movq %%mm1," #dst "(%%eax)			\n\t"
+
+		HDC_CHECK_AND_CPY((%1),0)
+		HDC_CHECK_AND_CPY((%%ecx),8)
+		HDC_CHECK_AND_CPY((%%ecx, %2),16)
+		HDC_CHECK_AND_CPY((%%ecx, %2, 2),24)
+		HDC_CHECK_AND_CPY((%1, %2, 4),32)
+		HDC_CHECK_AND_CPY((%%ebx),40)
+		HDC_CHECK_AND_CPY((%%ebx, %2),48)
+		HDC_CHECK_AND_CPY((%%ebx, %2, 2),56)
+		"jmp 2f						\n\t"
+		"1:						\n\t"
+// src does not cross a 32 byte cache line so dont waste time with alignment
+#define HDC_CHECK_AND_CPY2(src, dst) \
+		"movq " #src ", %%mm2				\n\t"\
+		"movq " #src ", %%mm1				\n\t"\
+		"psrlq $8, %%mm2				\n\t"\
+		"psubb %%mm1, %%mm2				\n\t"\
+		"paddb %%mm7, %%mm2				\n\t"\
+		"pcmpgtb %%mm6, %%mm2				\n\t"\
+		"paddb %%mm2, %%mm0				\n\t"\
+		"movq %%mm1," #dst "(%%eax)			\n\t"
+
+		HDC_CHECK_AND_CPY2((%1),0)
+		HDC_CHECK_AND_CPY2((%%ecx),8)
+		HDC_CHECK_AND_CPY2((%%ecx, %2),16)
+		HDC_CHECK_AND_CPY2((%%ecx, %2, 2),24)
+		HDC_CHECK_AND_CPY2((%1, %2, 4),32)
+		HDC_CHECK_AND_CPY2((%%ebx),40)
+		HDC_CHECK_AND_CPY2((%%ebx, %2),48)
+		HDC_CHECK_AND_CPY2((%%ebx, %2, 2),56)
+		"2:						\n\t"
 		"psllq $8, %%mm0				\n\t" // remove dummy value
 		"movq %%mm0, %%mm1				\n\t"
 		"psrlw $8, %%mm0				\n\t"
@@ -1347,14 +1367,13 @@ asm volatile (
 		"movq %%mm0, %%mm1				\n\t"
 		"psrlq $32, %%mm0				\n\t"
 		"paddb %%mm1, %%mm0				\n\t"
-		"popl %1\n\t"
 		"movd %%mm0, %0					\n\t"
 		: "=r" (numEq)
 		: "r" (src), "r" (stride)
-		: "%eax"
+		: "%eax", "%ebx", "%ecx"
 		);
 //	printf("%d\n", numEq);
-	numEq= (256 - (numEq & 0xFF)) &0xFF;
+	numEq= (256 - numEq) &0xFF;
 #else
 	int y;
 	for(y=0; y<BLOCK_SIZE; y++)
diff --git a/postproc/postprocess_template.c b/postproc/postprocess_template.c
index b32d3fe..5ed2477 100644
--- a/postproc/postprocess_template.c
+++ b/postproc/postprocess_template.c
@@ -336,7 +336,7 @@ static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
 		: "=r" (isOk)
 		: "r" (src), "r" (stride)
 		);
-	return isOk ? 1 : 0;
+	return isOk;
 #else
 
 	int isOk2= 1;
@@ -1303,40 +1303,60 @@ static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
 #ifdef HAVE_MMX
 asm volatile (
 //		"int $3 \n\t"
-		"pushl %1\n\t"
+		"leal (%1, %2), %%ecx				\n\t"
+		"leal (%%ecx, %2, 4), %%ebx			\n\t"
+//	0	1	2	3	4	5	6	7	8	9
+//	%1	ecx	ecx+%2	ecx+2%2	%1+4%2	ebx	ebx+%2	ebx+2%2	%1+8%2	ebx+4%2
 		"movq b7E, %%mm7				\n\t" // mm7 = 0x7F
 		"movq b7C, %%mm6				\n\t" // mm6 = 0x7D
-		"leal tempBlock, %%eax				\n\t"
 		"pxor %%mm0, %%mm0				\n\t"
+		"movl %1, %%eax					\n\t"
+		"andl $0x1F, %%eax				\n\t"
+		"cmpl $24, %%eax				\n\t"
+		"leal tempBlock, %%eax				\n\t"
+		"jb 1f						\n\t"
 
-#define HDC_CHECK_AND_CPY(i) \
-		"movq -4(%1), %%mm2				\n\t"\
-		"psrlq $32, %%mm2				\n\t"\
-		"punpckldq 4(%1), %%mm2				\n\t" /* (%1) */\
+#define HDC_CHECK_AND_CPY(src, dst) \
+		"movd " #src ", %%mm2				\n\t"\
+		"punpckldq 4" #src ", %%mm2				\n\t" /* (%1) */\
 		"movq %%mm2, %%mm1				\n\t"\
 		"psrlq $8, %%mm2				\n\t"\
 		"psubb %%mm1, %%mm2				\n\t"\
 		"paddb %%mm7, %%mm2				\n\t"\
 		"pcmpgtb %%mm6, %%mm2				\n\t"\
 		"paddb %%mm2, %%mm0				\n\t"\
-		"movq %%mm1," #i "(%%eax)			\n\t"
-
-		HDC_CHECK_AND_CPY(0)
-		"addl %2, %1					\n\t"
-		HDC_CHECK_AND_CPY(8)
-		"addl %2, %1					\n\t"
-		HDC_CHECK_AND_CPY(16)
-		"addl %2, %1					\n\t"
-		HDC_CHECK_AND_CPY(24)
-		"addl %2, %1					\n\t"
-		HDC_CHECK_AND_CPY(32)
-		"addl %2, %1					\n\t"
-		HDC_CHECK_AND_CPY(40)
-		"addl %2, %1					\n\t"
-		HDC_CHECK_AND_CPY(48)
-		"addl %2, %1					\n\t"
-		HDC_CHECK_AND_CPY(56)
-
+		"movq %%mm1," #dst "(%%eax)			\n\t"
+
+		HDC_CHECK_AND_CPY((%1),0)
+		HDC_CHECK_AND_CPY((%%ecx),8)
+		HDC_CHECK_AND_CPY((%%ecx, %2),16)
+		HDC_CHECK_AND_CPY((%%ecx, %2, 2),24)
+		HDC_CHECK_AND_CPY((%1, %2, 4),32)
+		HDC_CHECK_AND_CPY((%%ebx),40)
+		HDC_CHECK_AND_CPY((%%ebx, %2),48)
+		HDC_CHECK_AND_CPY((%%ebx, %2, 2),56)
+		"jmp 2f						\n\t"
+		"1:						\n\t"
+// src does not cross a 32 byte cache line so dont waste time with alignment
+#define HDC_CHECK_AND_CPY2(src, dst) \
+		"movq " #src ", %%mm2				\n\t"\
+		"movq " #src ", %%mm1				\n\t"\
+		"psrlq $8, %%mm2				\n\t"\
+		"psubb %%mm1, %%mm2				\n\t"\
+		"paddb %%mm7, %%mm2				\n\t"\
+		"pcmpgtb %%mm6, %%mm2				\n\t"\
+		"paddb %%mm2, %%mm0				\n\t"\
+		"movq %%mm1," #dst "(%%eax)			\n\t"
+
+		HDC_CHECK_AND_CPY2((%1),0)
+		HDC_CHECK_AND_CPY2((%%ecx),8)
+		HDC_CHECK_AND_CPY2((%%ecx, %2),16)
+		HDC_CHECK_AND_CPY2((%%ecx, %2, 2),24)
+		HDC_CHECK_AND_CPY2((%1, %2, 4),32)
+		HDC_CHECK_AND_CPY2((%%ebx),40)
+		HDC_CHECK_AND_CPY2((%%ebx, %2),48)
+		HDC_CHECK_AND_CPY2((%%ebx, %2, 2),56)
+		"2:						\n\t"
 		"psllq $8, %%mm0				\n\t" // remove dummy value
 		"movq %%mm0, %%mm1				\n\t"
 		"psrlw $8, %%mm0				\n\t"
@@ -1347,14 +1367,13 @@ asm volatile (
 		"movq %%mm0, %%mm1				\n\t"
 		"psrlq $32, %%mm0				\n\t"
 		"paddb %%mm1, %%mm0				\n\t"
-		"popl %1\n\t"
 		"movd %%mm0, %0					\n\t"
 		: "=r" (numEq)
 		: "r" (src), "r" (stride)
-		: "%eax"
+		: "%eax", "%ebx", "%ecx"
 		);
 //	printf("%d\n", numEq);
-	numEq= (256 - (numEq & 0xFF)) &0xFF;
+	numEq= (256 - numEq) &0xFF;
 #else
 	int y;
 	for(y=0; y<BLOCK_SIZE; y++)

-- 
Libav/FFmpeg packaging



More information about the pkg-multimedia-commits mailing list