[SCM] libav/experimental: Unroll loop in h264_idct_add8_sse2(). This means we can inline scan8[] in the code directly also and remove loop setup. 20% faster in function, 0.8% overall.

Sun Jun 30 17:16:06 UTC 2013

The following commit has been merged in the experimental branch:
commit 4bca677494c7d39753cfbc356e1b8c3988024d16
Author: Ronald S. Bultje <rsbultje at gmail.com>
Date:   Fri Sep 24 14:05:45 2010 +0000

    Unroll loop in h264_idct_add8_sse2(). This means we can inline scan8[] in the
    code directly also and remove loop setup. 20% faster in function, 0.8% overall.
    
    See "[PATCH] unroll loop in h264_idct_add8_sse2()" thread on ML.
    
    Originally committed as revision 25171 to svn://svn.ffmpeg.org/ffmpeg/trunk

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 3311ab5..7af6952 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -804,62 +804,53 @@ cglobal h264_idct_add16intra_sse2, 5, 7, 8
     jl .next2blocks
     REP_RET
 
-h264_idct_add8_sse2_plane:
-.next2blocks
-    movzx       r0, byte [scan8+r5]
-    movzx       r0, word [r4+r0]
+%macro add8_sse2_cycle 2
+    movzx       r0, word [r4+%2]
     test        r0, r0
-    jz .try_dc
+    jz .try%1dc
 %ifdef ARCH_X86_64
-    mov        r0d, dword [r1+r5*4]
+    mov        r0d, dword [r1+%1*8+64]
     add         r0, [r10]
 %else
-    mov         r0, r1m ; XXX r1m here is actually r0m of the calling func
+    mov         r0, r0m
     mov         r0, [r0]
-    add         r0, dword [r1+r5*4]
+    add         r0, dword [r1+%1*8+64]
 %endif
     call        x264_add8x4_idct_sse2
-    add         r5, 2
-    add         r2, 64
-    test        r5, 3
-    jnz .next2blocks
-    rep ret
-.try_dc
+    jmp .cycle%1end
+.try%1dc
     movsx       r0, word [r2   ]
     or         r0w, word [r2+32]
-    jz .skip2blocks
+    jz .cycle%1end
 %ifdef ARCH_X86_64
-    mov        r0d, dword [r1+r5*4]
+    mov        r0d, dword [r1+%1*8+64]
     add         r0, [r10]
 %else
-    mov         r0, r1m ; XXX r1m here is actually r0m of the calling func
+    mov         r0, r0m
     mov         r0, [r0]
-    add         r0, dword [r1+r5*4]
+    add         r0, dword [r1+%1*8+64]
 %endif
     call        h264_idct_dc_add8_mmx2
-.skip2blocks
-    add         r5, 2
+.cycle%1end
+%if %1 < 3
     add         r2, 64
-    test        r5, 3
-    jnz .next2blocks
-    rep ret
+%endif
+%endmacro
 
 ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
 ;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
 cglobal h264_idct_add8_sse2, 5, 7, 8
-    mov          r5, 16
     add          r2, 512
-%ifdef PIC
-    lea        r11, [scan8_mem]
-%endif
 %ifdef ARCH_X86_64
     mov         r10, r0
 %endif
-    call h264_idct_add8_sse2_plane
+    add8_sse2_cycle 0, 0x09
+    add8_sse2_cycle 1, 0x11
 %ifdef ARCH_X86_64
     add         r10, gprsize
 %else
     add        r0mp, gprsize
 %endif
-    call h264_idct_add8_sse2_plane
+    add8_sse2_cycle 2, 0x21
+    add8_sse2_cycle 3, 0x29
     RET

-- 
Libav/FFmpeg packaging