[SCM] libav/experimental: avoid overflow in the 3rd lifting step, this now needs mmx2 at minimum (patch for plain mmx support is welcome ...)
siretart at users.alioth.debian.org
siretart at users.alioth.debian.org
Sun Jun 30 16:06:01 UTC 2013
The following commit has been merged in the experimental branch:
commit 629750290f6122a72e68c34cf94f521a90def2ef
Author: Michael Niedermayer <michaelni at gmx.at>
Date: Sun Aug 26 01:11:02 2007 +0000
avoid overflow in the 3rd lifting step, this now needs mmx2 at minimum
(patch for plain mmx support is welcome ...)
Originally committed as revision 10226 to svn://svn.ffmpeg.org/ffmpeg/trunk
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index 3ebf148..191688c 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -3627,8 +3627,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
}
else{
+ if(mm_flags & MM_MMXEXT){
c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
+ }
c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
}
#endif
diff --git a/libavcodec/i386/snowdsp_mmx.c b/libavcodec/i386/snowdsp_mmx.c
index 6deaad2..e7f4b29 100644
--- a/libavcodec/i386/snowdsp_mmx.c
+++ b/libavcodec/i386/snowdsp_mmx.c
@@ -111,22 +111,29 @@ void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width){
i = 0;
asm volatile(
- "psllw $2, %%xmm7 \n\t"
+ "psllw $13, %%xmm7 \n\t"
+ "pcmpeqw %%xmm6, %%xmm6 \n\t"
+ "psrlw $13, %%xmm6 \n\t"
+ "paddw %%xmm7, %%xmm6 \n\t"
::);
for(; i<w_l-15; i+=16){
asm volatile(
- "movdqu (%1), %%xmm1 \n\t"
- "movdqu 16(%1), %%xmm5 \n\t"
- "movdqu 2(%1), %%xmm0 \n\t"
- "movdqu 18(%1), %%xmm4 \n\t" //FIXME try aligned reads and shifts
- "paddw %%xmm1, %%xmm0 \n\t"
- "paddw %%xmm5, %%xmm4 \n\t"
- "paddw %%xmm7, %%xmm0 \n\t"
- "paddw %%xmm7, %%xmm4 \n\t"
+ "movdqu (%1), %%xmm0 \n\t"
+ "movdqu 16(%1), %%xmm4 \n\t"
+ "movdqu 2(%1), %%xmm1 \n\t"
+ "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
+ "paddw %%xmm6, %%xmm0 \n\t"
+ "paddw %%xmm6, %%xmm4 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm5 \n\t"
+ "pavgw %%xmm1, %%xmm0 \n\t"
+ "pavgw %%xmm5, %%xmm4 \n\t"
+ "psubw %%xmm7, %%xmm0 \n\t"
+ "psubw %%xmm7, %%xmm4 \n\t"
+ "psraw $1, %%xmm0 \n\t"
+ "psraw $1, %%xmm4 \n\t"
"movdqa (%0), %%xmm1 \n\t"
"movdqa 16(%0), %%xmm5 \n\t"
- "psraw $2, %%xmm0 \n\t"
- "psraw $2, %%xmm4 \n\t"
"paddw %%xmm1, %%xmm0 \n\t"
"paddw %%xmm5, %%xmm4 \n\t"
"psraw $2, %%xmm0 \n\t"
@@ -288,18 +295,27 @@ void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width){
i = 1;
b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
asm volatile(
- "psllw $2, %%mm7 \n\t"
+ "psllw $13, %%mm7 \n\t"
+ "pcmpeqw %%mm6, %%mm6 \n\t"
+ "psrlw $13, %%mm6 \n\t"
+ "paddw %%mm7, %%mm6 \n\t"
::);
for(; i<w_l-7; i+=8){
asm volatile(
"movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm4 \n\t"
- "paddw 2(%1), %%mm0 \n\t"
- "paddw 10(%1), %%mm4 \n\t"
- "paddw %%mm7, %%mm0 \n\t"
- "paddw %%mm7, %%mm4 \n\t"
- "psraw $2, %%mm0 \n\t"
- "psraw $2, %%mm4 \n\t"
+ "movq 2(%1), %%mm1 \n\t"
+ "movq 10(%1), %%mm5 \n\t"
+ "paddw %%mm6, %%mm0 \n\t"
+ "paddw %%mm6, %%mm4 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm5 \n\t"
+ "pavgw %%mm1, %%mm0 \n\t"
+ "pavgw %%mm5, %%mm4 \n\t"
+ "psubw %%mm7, %%mm0 \n\t"
+ "psubw %%mm7, %%mm4 \n\t"
+ "psraw $1, %%mm0 \n\t"
+ "psraw $1, %%mm4 \n\t"
"movq (%0), %%mm1 \n\t"
"movq 8(%0), %%mm5 \n\t"
"paddw %%mm1, %%mm0 \n\t"
@@ -467,16 +483,31 @@ void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_store(REG_S,"xmm0","xmm2","xmm4","xmm6")
"mov %2, %%"REG_a" \n\t"
- snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
- snow_vertical_compose_sse2_sra("2","xmm0","xmm2","xmm4","xmm6")
+
+ "pcmpeqw %%xmm7, %%xmm7 \n\t"
+ "pcmpeqw %%xmm5, %%xmm5 \n\t"
+ "psllw $15, %%xmm7 \n\t"
+ "psrlw $13, %%xmm5 \n\t"
+ "paddw %%xmm7, %%xmm5 \n\t"
+ snow_vertical_compose_sse2_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
+ "movq (%%"REG_a",%%"REG_d",2), %%xmm1 \n\t"
+ "movq 8(%%"REG_a",%%"REG_d",2), %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "pavgw %%xmm1, %%xmm0 \n\t"
+ "pavgw %%xmm3, %%xmm2 \n\t"
+ "movq 16(%%"REG_a",%%"REG_d",2), %%xmm1 \n\t"
+ "movq 24(%%"REG_a",%%"REG_d",2), %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "pavgw %%xmm1, %%xmm4 \n\t"
+ "pavgw %%xmm3, %%xmm6 \n\t"
+ snow_vertical_compose_sse2_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
- "pcmpeqd %%xmm1, %%xmm1 \n\t"
- "psllw $15, %%xmm1 \n\t"
- "psrlw $14, %%xmm1 \n\t"
"mov %1, %%"REG_S" \n\t"
- snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_sra("2","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6")
@@ -569,16 +600,30 @@ void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, I
snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_store(REG_S,"mm0","mm2","mm4","mm6")
"mov %2, %%"REG_a" \n\t"
- snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
- snow_vertical_compose_mmx_sra("2","mm0","mm2","mm4","mm6")
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "pcmpeqw %%mm5, %%mm5 \n\t"
+ "psllw $15, %%mm7 \n\t"
+ "psrlw $13, %%mm5 \n\t"
+ "paddw %%mm7, %%mm5 \n\t"
+ snow_vertical_compose_mmx_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
+ "movq (%%"REG_a",%%"REG_d",2), %%mm1 \n\t"
+ "movq 8(%%"REG_a",%%"REG_d",2), %%mm3 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "pavgw %%mm1, %%mm0 \n\t"
+ "pavgw %%mm3, %%mm2 \n\t"
+ "movq 16(%%"REG_a",%%"REG_d",2), %%mm1 \n\t"
+ "movq 24(%%"REG_a",%%"REG_d",2), %%mm3 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "pavgw %%mm1, %%mm4 \n\t"
+ "pavgw %%mm3, %%mm6 \n\t"
+ snow_vertical_compose_sse2_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
- "pcmpeqw %%mm1, %%mm1 \n\t"
- "psllw $15, %%mm1 \n\t"
- "psrlw $14, %%mm1 \n\t"
"mov %1, %%"REG_S" \n\t"
- snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_sra("2","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6")
--
Libav/FFmpeg packaging
More information about the pkg-multimedia-commits
mailing list