[SCM] libav/experimental: dsputil: Move APE-specific bits into apedsp

Sun Aug 10 16:03:20 UTC 2014

The following commit has been merged in the experimental branch:
commit 054013a0fc6f2b52c60cee3e051be8cc7f82cef3
Author: Diego Biurrun <diego at biurrun.de>
Date:   Sun Dec 29 02:32:16 2013 +0100

    dsputil: Move APE-specific bits into apedsp

diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index fb41918..6329295 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -25,6 +25,7 @@
 #include "libavutil/avassert.h"
 #include "libavutil/channel_layout.h"
 #include "libavutil/opt.h"
+#include "apedsp.h"
 #include "avcodec.h"
 #include "dsputil.h"
 #include "bytestream.h"
@@ -136,6 +137,7 @@ typedef struct APEContext {
     AVClass *class;                          ///< class for AVOptions
     AVCodecContext *avctx;
     DSPContext dsp;
+    APEDSPContext adsp;
     int channels;
     int samples;                             ///< samples left to decode in current frame
     int bps;
@@ -195,8 +197,6 @@ static void predictor_decode_stereo_3930(APEContext *ctx, int count);
 static void predictor_decode_mono_3950(APEContext *ctx, int count);
 static void predictor_decode_stereo_3950(APEContext *ctx, int count);
 
-// TODO: dsputilize
-
 static av_cold int ape_decode_close(AVCodecContext *avctx)
 {
     APEContext *s = avctx->priv_data;
@@ -212,6 +212,19 @@ static av_cold int ape_decode_close(AVCodecContext *avctx)
     return 0;
 }
 
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
+                                              const int16_t *v3,
+                                              int order, int mul)
+{
+    int res = 0;
+
+    while (order--) {
+        res   += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+    }
+    return res;
+}
+
 static av_cold int ape_decode_init(AVCodecContext *avctx)
 {
     APEContext *s = avctx->priv_data;
@@ -292,6 +305,15 @@ static av_cold int ape_decode_init(AVCodecContext *avctx)
         s->predictor_decode_stereo = predictor_decode_stereo_3950;
     }
 
+    s->adsp.scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
+
+    if (ARCH_ARM)
+        ff_apedsp_init_arm(&s->adsp);
+    if (ARCH_PPC)
+        ff_apedsp_init_ppc(&s->adsp);
+    if (ARCH_X86)
+        ff_apedsp_init_x86(&s->adsp);
+
     ff_dsputil_init(&s->dsp, avctx);
     avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
 
@@ -1263,9 +1285,10 @@ static void do_apply_filter(APEContext *ctx, int version, APEFilter *f,
 
     while (count--) {
         /* round fixedpoint scalar product */
-        res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order,
-                                                    f->adaptcoeffs - order,
-                                                    order, APESIGN(*data));
+        res = ctx->adsp.scalarproduct_and_madd_int16(f->coeffs,
+                                                     f->delay - order,
+                                                     f->adaptcoeffs - order,
+                                                     order, APESIGN(*data));
         res = (res + (1 << (fracbits - 1))) >> fracbits;
         res += *data;
         *data++ = res;
diff --git a/libavformat/apetag.h b/libavcodec/apedsp.h
similarity index 54%
copy from libavformat/apetag.h
copy to libavcodec/apedsp.h
index 36e3211..64e2749 100644
--- a/libavformat/apetag.h
+++ b/libavcodec/apedsp.h
@@ -1,5 +1,5 @@
 /*
- * APE tag handling
+ * Monkey's Audio lossless audio decoder
  * Copyright (c) 2007 Benjamin Zores <ben at geexbox.org>
  *  based upon libdemac from Dave Chapman.
  *
@@ -20,21 +20,25 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVFORMAT_APETAG_H
-#define AVFORMAT_APETAG_H
+#ifndef AVCODEC_APEDSP_H
+#define AVCODEC_APEDSP_H
 
-#include "avformat.h"
+#include <stdint.h>
 
-/**
- * Read and parse an APE tag
- *
- * @return offset of the tag start in the file
- */
-int64_t ff_ape_parse_tag(AVFormatContext *s);
+typedef struct APEDSPContext {
+    /**
+     * Calculate scalar product of v1 and v2,
+     * and v1[i] += v3[i] * mul
+     * @param len length of vectors, should be multiple of 16
+     */
+    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
+                                            const int16_t *v2,
+                                            const int16_t *v3,
+                                            int len, int mul);
+} APEDSPContext;
 
-/**
- * Write an APE tag into a file.
- */
-int ff_ape_write_tag(AVFormatContext *s);
+void ff_apedsp_init_arm(APEDSPContext *c);
+void ff_apedsp_init_ppc(APEDSPContext *c);
+void ff_apedsp_init_x86(APEDSPContext *c);
 
-#endif /* AVFORMAT_APETAG_H */
+#endif /* AVCODEC_APEDSP_H */
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 9d5b6aa..13025af 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -24,6 +24,7 @@ OBJS-$(CONFIG_VP3DSP)                  += arm/vp3dsp_init_arm.o
 
 OBJS-$(CONFIG_AAC_DECODER)             += arm/aacpsdsp_init_arm.o       \
                                           arm/sbrdsp_init_arm.o
+OBJS-$(CONFIG_APE_DECODER)             += arm/apedsp_init_arm.o
 OBJS-$(CONFIG_DCA_DECODER)             += arm/dcadsp_init_arm.o
 OBJS-$(CONFIG_FLAC_DECODER)            += arm/flacdsp_init_arm.o        \
                                           arm/flacdsp_arm.o
@@ -97,6 +98,7 @@ NEON-OBJS-$(CONFIG_VP3DSP)             += arm/vp3dsp_neon.o
 
 NEON-OBJS-$(CONFIG_AAC_DECODER)        += arm/aacpsdsp_neon.o           \
                                           arm/sbrdsp_neon.o
+NEON-OBJS-$(CONFIG_APE_DECODER)        += arm/apedsp_neon.o
 NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o             \
                                           arm/synth_filter_neon.o
 NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
diff --git a/libavcodec/arm/vorbisdsp_init_arm.c b/libavcodec/arm/apedsp_init_arm.c
similarity index 70%
copy from libavcodec/arm/vorbisdsp_init_arm.c
copy to libavcodec/arm/apedsp_init_arm.c
index 853ba2d..47ea034 100644
--- a/libavcodec/arm/vorbisdsp_init_arm.c
+++ b/libavcodec/arm/apedsp_init_arm.c
@@ -1,6 +1,5 @@
 /*
- * ARM NEON optimised DSP functions
- * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ * Copyright (c) 2011 Mans Rullgard <mans at mansr.com>
  *
  * This file is part of Libav.
  *
@@ -19,19 +18,21 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <stdint.h>
+
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/arm/cpu.h"
-#include "libavcodec/vorbisdsp.h"
+#include "libavcodec/apedsp.h"
 
-void ff_vorbis_inverse_coupling_neon(float *mag, float *ang,
-                                     intptr_t blocksize);
+int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
+                                             const int16_t *v3, int len, int mul);
 
-av_cold void ff_vorbisdsp_init_arm(VorbisDSPContext *c)
+av_cold void ff_apedsp_init_arm(APEDSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
     if (have_neon(cpu_flags)) {
-        c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
     }
 }
diff --git a/libavcodec/arm/int_neon.S b/libavcodec/arm/apedsp_neon.S
similarity index 70%
copy from libavcodec/arm/int_neon.S
copy to libavcodec/arm/apedsp_neon.S
index 3d2faff..7cfbf43 100644
--- a/libavcodec/arm/int_neon.S
+++ b/libavcodec/arm/apedsp_neon.S
@@ -21,34 +21,6 @@
 
 #include "libavutil/arm/asm.S"
 
-function ff_scalarproduct_int16_neon, export=1
-        vmov.i16        q0,  #0
-        vmov.i16        q1,  #0
-        vmov.i16        q2,  #0
-        vmov.i16        q3,  #0
-1:      vld1.16         {d16-d17}, [r0]!
-        vld1.16         {d20-d21}, [r1,:128]!
-        vmlal.s16       q0,  d16,  d20
-        vld1.16         {d18-d19}, [r0]!
-        vmlal.s16       q1,  d17,  d21
-        vld1.16         {d22-d23}, [r1,:128]!
-        vmlal.s16       q2,  d18,  d22
-        vmlal.s16       q3,  d19,  d23
-        subs            r2,  r2,   #16
-        bne             1b
-
-        vpadd.s32       d16, d0,   d1
-        vpadd.s32       d17, d2,   d3
-        vpadd.s32       d18, d4,   d5
-        vpadd.s32       d19, d6,   d7
-        vpadd.s32       d0,  d16,  d17
-        vpadd.s32       d1,  d18,  d19
-        vpadd.s32       d2,  d0,   d1
-        vpaddl.s32      d3,  d2
-        vmov.32         r0,  d3[0]
-        bx              lr
-endfunc
-
 @ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
 function ff_scalarproduct_and_madd_int16_neon, export=1
         vld1.16         {d28[],d29[]}, [sp]
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 16e052d..c9bdaa5 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -44,9 +44,6 @@ void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
 
 int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
 
-int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
-                                             const int16_t *v3, int len, int mul);
-
 av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
                                   unsigned high_bit_depth)
 {
@@ -73,6 +70,4 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
     c->vector_clip_int32 = ff_vector_clip_int32_neon;
 
     c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
-
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
 }
diff --git a/libavcodec/arm/int_neon.S b/libavcodec/arm/int_neon.S
index 3d2faff..42f3739 100644
--- a/libavcodec/arm/int_neon.S
+++ b/libavcodec/arm/int_neon.S
@@ -48,43 +48,3 @@ function ff_scalarproduct_int16_neon, export=1
         vmov.32         r0,  d3[0]
         bx              lr
 endfunc
-
-@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
-function ff_scalarproduct_and_madd_int16_neon, export=1
-        vld1.16         {d28[],d29[]}, [sp]
-        vmov.i16        q0,  #0
-        vmov.i16        q1,  #0
-        vmov.i16        q2,  #0
-        vmov.i16        q3,  #0
-        mov             r12, r0
-
-1:      vld1.16         {d16-d17}, [r0,:128]!
-        vld1.16         {d18-d19}, [r1]!
-        vld1.16         {d20-d21}, [r2]!
-        vld1.16         {d22-d23}, [r0,:128]!
-        vld1.16         {d24-d25}, [r1]!
-        vld1.16         {d26-d27}, [r2]!
-        vmul.s16        q10, q10,  q14
-        vmul.s16        q13, q13,  q14
-        vmlal.s16       q0,  d16,  d18
-        vmlal.s16       q1,  d17,  d19
-        vadd.s16        q10, q8,   q10
-        vadd.s16        q13, q11,  q13
-        vmlal.s16       q2,  d22,  d24
-        vmlal.s16       q3,  d23,  d25
-        vst1.16         {q10},     [r12,:128]!
-        subs            r3,  r3,   #16
-        vst1.16         {q13},     [r12,:128]!
-        bne             1b
-
-        vpadd.s32       d16, d0,   d1
-        vpadd.s32       d17, d2,   d3
-        vpadd.s32       d18, d4,   d5
-        vpadd.s32       d19, d6,   d7
-        vpadd.s32       d0,  d16,  d17
-        vpadd.s32       d1,  d18,  d19
-        vpadd.s32       d2,  d0,   d1
-        vpaddl.s32      d3,  d2
-        vmov.32         r0,  d3[0]
-        bx              lr
-endfunc
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 11447c0..6b84658 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2069,19 +2069,6 @@ static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
     return res;
 }
 
-static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
-                                              const int16_t *v3,
-                                              int order, int mul)
-{
-    int res = 0;
-
-    while (order--) {
-        res   += *v1 * *v2++;
-        *v1++ += mul * *v3++;
-    }
-    return res;
-}
-
 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                 int32_t max, unsigned int len)
 {
@@ -2294,8 +2281,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
     c->try_8x8basis = try_8x8basis_c;
     c->add_8x8basis = add_8x8basis_c;
 
-    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
-
     c->scalarproduct_int16 = scalarproduct_int16_c;
     c->vector_clip_int32   = vector_clip_int32_c;
     c->vector_clipf        = vector_clipf_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index d261f7e..471988b 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -255,16 +255,6 @@ typedef struct DSPContext {
      */
     int32_t (*scalarproduct_int16)(const int16_t *v1,
                                    const int16_t *v2 /* align 16 */, int len);
-    /* ape functions */
-    /**
-     * Calculate scalar product of v1 and v2,
-     * and v1[i] += v3[i] * mul
-     * @param len length of vectors, should be multiple of 16
-     */
-    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
-                                            const int16_t *v2,
-                                            const int16_t *v3,
-                                            int len, int mul);
 
     /**
      * Clip each element in an array of int32_t to a given minimum and
diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile
index ec0674c..b78d4be 100644
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -12,6 +12,7 @@ OBJS-$(CONFIG_MPEGVIDEO)               += ppc/mpegvideo_altivec.o
 OBJS-$(CONFIG_VIDEODSP)                += ppc/videodsp_ppc.o
 OBJS-$(CONFIG_VP3DSP)                  += ppc/vp3dsp_altivec.o
 
+OBJS-$(CONFIG_APE_DECODER)             += ppc/apedsp_altivec.o
 OBJS-$(CONFIG_SVQ1_ENCODER)            += ppc/svq1enc_altivec.o
 OBJS-$(CONFIG_VC1_DECODER)             += ppc/vc1dsp_altivec.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += ppc/vorbisdsp_altivec.o
diff --git a/libavcodec/ppc/int_altivec.c b/libavcodec/ppc/apedsp_altivec.c
similarity index 74%
copy from libavcodec/ppc/int_altivec.c
copy to libavcodec/ppc/apedsp_altivec.c
index fa3cb66..de9df45 100644
--- a/libavcodec/ppc/int_altivec.c
+++ b/libavcodec/ppc/apedsp_altivec.c
@@ -18,11 +18,6 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-/**
- * @file
- * miscellaneous integer operations
- */
-
 #include "config.h"
 #if HAVE_ALTIVEC_H
 #include <altivec.h>
@@ -30,32 +25,9 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/ppc/types_altivec.h"
-#include "libavutil/ppc/util_altivec.h"
-#include "libavcodec/dsputil.h"
-#include "dsputil_altivec.h"
-
-static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
-                                           int order)
-{
-    int i;
-    LOAD_ZERO;
-    register vec_s16 vec1;
-    register vec_s32 res = vec_splat_s32(0), t;
-    int32_t ires;
-
-    for (i = 0; i < order; i += 8) {
-        vec1 = vec_unaligned_load(v1);
-        t    = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
-        res  = vec_sums(t, res);
-        v1  += 8;
-        v2  += 8;
-    }
-    res = vec_splat(res, 3);
-    vec_ste(res, 0, &ires);
-
-    return ires;
-}
+#include "libavcodec/apedsp.h"
 
+#if HAVE_ALTIVEC
 static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
                                                     const int16_t *v2,
                                                     const int16_t *v3,
@@ -95,10 +67,11 @@ static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
 
     return ires;
 }
+#endif /* HAVE_ALTIVEC */
 
-av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
+av_cold void ff_apedsp_init_ppc(APEDSPContext *c)
 {
-    c->scalarproduct_int16 = scalarproduct_int16_altivec;
-
+#if HAVE_ALTIVEC
     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
+#endif /* HAVE_ALTIVEC */
 }
diff --git a/libavcodec/ppc/int_altivec.c b/libavcodec/ppc/int_altivec.c
index fa3cb66..d76d34a 100644
--- a/libavcodec/ppc/int_altivec.c
+++ b/libavcodec/ppc/int_altivec.c
@@ -56,49 +56,7 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
     return ires;
 }
 
-static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
-                                                    const int16_t *v2,
-                                                    const int16_t *v3,
-                                                    int order, int mul)
-{
-    LOAD_ZERO;
-    vec_s16 *pv1 = (vec_s16 *) v1;
-    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
-    register vec_s16 t0, t1, i0, i1, i4;
-    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
-    register vec_s32 res = zero_s32v;
-    register vec_u8 align = vec_lvsl(0, v2);
-    int32_t ires;
-
-    order >>= 4;
-    do {
-        i1     = vec_ld(16, v2);
-        t0     = vec_perm(i2, i1, align);
-        i2     = vec_ld(32, v2);
-        t1     = vec_perm(i1, i2, align);
-        i0     = pv1[0];
-        i1     = pv1[1];
-        res    = vec_msum(t0, i0, res);
-        res    = vec_msum(t1, i1, res);
-        i4     = vec_ld(16, v3);
-        t0     = vec_perm(i3, i4, align);
-        i3     = vec_ld(32, v3);
-        t1     = vec_perm(i4, i3, align);
-        pv1[0] = vec_mladd(t0, muls, i0);
-        pv1[1] = vec_mladd(t1, muls, i1);
-        pv1   += 2;
-        v2    += 16;
-        v3    += 16;
-    } while (--order);
-    res = vec_splat(vec_sums(res, zero_s32v), 3);
-    vec_ste(res, 0, &ires);
-
-    return ires;
-}
-
 av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
 {
     c->scalarproduct_int16 = scalarproduct_int16_altivec;
-
-    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
 }
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 8830a22..1024226 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -25,6 +25,7 @@ OBJS-$(CONFIG_VP3DSP)                  += x86/vp3dsp_init.o
 OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
 
 OBJS-$(CONFIG_AAC_DECODER)             += x86/sbrdsp_init.o
+OBJS-$(CONFIG_APE_DECODER)             += x86/apedsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
@@ -89,6 +90,7 @@ YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 
 YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
+YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
diff --git a/libavcodec/x86/apedsp.asm b/libavcodec/x86/apedsp.asm
new file mode 100644
index 0000000..d721ebd
--- /dev/null
+++ b/libavcodec/x86/apedsp.asm
@@ -0,0 +1,167 @@
+;******************************************************************************
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+%macro SCALARPRODUCT 0
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+;                                     int order, int mul)
+cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd    m7, mulm
+%if mmsize == 16
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+%else
+    pshufw  m7, m7, 0
+%endif
+    pxor    m6, m6
+    add v1q, orderq
+    add v2q, orderq
+    add v3q, orderq
+    neg orderq
+.loop:
+    movu    m0, [v2q + orderq]
+    movu    m1, [v2q + orderq + mmsize]
+    mova    m4, [v1q + orderq]
+    mova    m5, [v1q + orderq + mmsize]
+    movu    m2, [v3q + orderq]
+    movu    m3, [v3q + orderq + mmsize]
+    pmaddwd m0, m4
+    pmaddwd m1, m5
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddd   m6, m0
+    paddd   m6, m1
+    paddw   m2, m4
+    paddw   m3, m5
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    add     orderq, mmsize*2
+    jl .loop
+%if mmsize == 16
+    movhlps m0, m6
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e
+%else
+    pshufw  m0, m6, 0x4e
+%endif
+    paddd   m6, m0
+    movd   eax, m6
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SCALARPRODUCT
+INIT_XMM sse2
+SCALARPRODUCT
+
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+    sub     orderq, mmsize*2
+%if %1
+    mova    m1, m4
+    mova    m4, [v2q + orderq]
+    mova    m0, [v2q + orderq + mmsize]
+    palignr m1, m0, %1
+    palignr m0, m4, %1
+    mova    m3, m5
+    mova    m5, [v3q + orderq]
+    mova    m2, [v3q + orderq + mmsize]
+    palignr m3, m2, %1
+    palignr m2, m5, %1
+%else
+    mova    m0, [v2q + orderq]
+    mova    m1, [v2q + orderq + mmsize]
+    mova    m2, [v3q + orderq]
+    mova    m3, [v3q + orderq + mmsize]
+%endif
+    %define t0  [v1q + orderq]
+    %define t1  [v1q + orderq + mmsize]
+%if ARCH_X86_64
+    mova    m8, t0
+    mova    m9, t1
+    %define t0  m8
+    %define t1  m9
+%endif
+    pmaddwd m0, t0
+    pmaddwd m1, t1
+    pmullw  m2, m7
+    pmullw  m3, m7
+    paddw   m2, t0
+    paddw   m3, t1
+    paddd   m6, m0
+    paddd   m6, m1
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    jg .loop%1
+%if %1
+    jmp .end
+%endif
+%endmacro
+
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+;                                     int order, int mul)
+INIT_XMM ssse3
+cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd    m7, mulm
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+    pxor    m6, m6
+    mov    r4d, v2d
+    and    r4d, 15
+    and    v2q, ~15
+    and    v3q, ~15
+    mova    m4, [v2q + orderq]
+    mova    m5, [v3q + orderq]
+    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+    cmp    r4d, 0
+    je .loop0
+    cmp    r4d, 2
+    je .loop2
+    cmp    r4d, 4
+    je .loop4
+    cmp    r4d, 6
+    je .loop6
+    cmp    r4d, 8
+    je .loop8
+    cmp    r4d, 10
+    je .loop10
+    cmp    r4d, 12
+    je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+    movhlps m0, m6
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e
+    paddd   m6, m0
+    movd   eax, m6
+    RET
diff --git a/libavcodec/x86/apedsp_init.c b/libavcodec/x86/apedsp_init.c
new file mode 100644
index 0000000..f692c2b
--- /dev/null
+++ b/libavcodec/x86/apedsp_init.c
@@ -0,0 +1,47 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/apedsp.h"
+
+int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
+                                               const int16_t *v3,
+                                               int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
+                                             const int16_t *v3,
+                                             int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
+                                              const int16_t *v3,
+                                              int order, int mul);
+
+av_cold void ff_apedsp_init_x86(APEDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMXEXT(cpu_flags))
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+
+    if (EXTERNAL_SSSE3(cpu_flags) &&
+        !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+}
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 684f09b..b5d6d3c 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -53,52 +53,6 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
     paddd   m2, m0
     movd   eax, m2
     RET
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-;                                     int order, int mul)
-cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
-    shl orderq, 1
-    movd    m7, mulm
-%if mmsize == 16
-    pshuflw m7, m7, 0
-    punpcklqdq m7, m7
-%else
-    pshufw  m7, m7, 0
-%endif
-    pxor    m6, m6
-    add v1q, orderq
-    add v2q, orderq
-    add v3q, orderq
-    neg orderq
-.loop:
-    movu    m0, [v2q + orderq]
-    movu    m1, [v2q + orderq + mmsize]
-    mova    m4, [v1q + orderq]
-    mova    m5, [v1q + orderq + mmsize]
-    movu    m2, [v3q + orderq]
-    movu    m3, [v3q + orderq + mmsize]
-    pmaddwd m0, m4
-    pmaddwd m1, m5
-    pmullw  m2, m7
-    pmullw  m3, m7
-    paddd   m6, m0
-    paddd   m6, m1
-    paddw   m2, m4
-    paddw   m3, m5
-    mova    [v1q + orderq], m2
-    mova    [v1q + orderq + mmsize], m3
-    add     orderq, mmsize*2
-    jl .loop
-%if mmsize == 16
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-%else
-    pshufw  m0, m6, 0x4e
-%endif
-    paddd   m6, m0
-    movd   eax, m6
-    RET
 %endmacro
 
 INIT_MMX mmxext
@@ -106,97 +60,6 @@ SCALARPRODUCT
 INIT_XMM sse2
 SCALARPRODUCT
 
-%macro SCALARPRODUCT_LOOP 1
-align 16
-.loop%1:
-    sub     orderq, mmsize*2
-%if %1
-    mova    m1, m4
-    mova    m4, [v2q + orderq]
-    mova    m0, [v2q + orderq + mmsize]
-    palignr m1, m0, %1
-    palignr m0, m4, %1
-    mova    m3, m5
-    mova    m5, [v3q + orderq]
-    mova    m2, [v3q + orderq + mmsize]
-    palignr m3, m2, %1
-    palignr m2, m5, %1
-%else
-    mova    m0, [v2q + orderq]
-    mova    m1, [v2q + orderq + mmsize]
-    mova    m2, [v3q + orderq]
-    mova    m3, [v3q + orderq + mmsize]
-%endif
-    %define t0  [v1q + orderq]
-    %define t1  [v1q + orderq + mmsize]
-%if ARCH_X86_64
-    mova    m8, t0
-    mova    m9, t1
-    %define t0  m8
-    %define t1  m9
-%endif
-    pmaddwd m0, t0
-    pmaddwd m1, t1
-    pmullw  m2, m7
-    pmullw  m3, m7
-    paddw   m2, t0
-    paddw   m3, t1
-    paddd   m6, m0
-    paddd   m6, m1
-    mova    [v1q + orderq], m2
-    mova    [v1q + orderq + mmsize], m3
-    jg .loop%1
-%if %1
-    jmp .end
-%endif
-%endmacro
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-;                                     int order, int mul)
-INIT_XMM ssse3
-cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
-    shl orderq, 1
-    movd    m7, mulm
-    pshuflw m7, m7, 0
-    punpcklqdq m7, m7
-    pxor    m6, m6
-    mov    r4d, v2d
-    and    r4d, 15
-    and    v2q, ~15
-    and    v3q, ~15
-    mova    m4, [v2q + orderq]
-    mova    m5, [v3q + orderq]
-    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
-    cmp    r4d, 0
-    je .loop0
-    cmp    r4d, 2
-    je .loop2
-    cmp    r4d, 4
-    je .loop4
-    cmp    r4d, 6
-    je .loop6
-    cmp    r4d, 8
-    je .loop8
-    cmp    r4d, 10
-    je .loop10
-    cmp    r4d, 12
-    je .loop12
-SCALARPRODUCT_LOOP 14
-SCALARPRODUCT_LOOP 12
-SCALARPRODUCT_LOOP 10
-SCALARPRODUCT_LOOP 8
-SCALARPRODUCT_LOOP 6
-SCALARPRODUCT_LOOP 4
-SCALARPRODUCT_LOOP 2
-SCALARPRODUCT_LOOP 0
-.end:
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-    paddd   m6, m0
-    movd   eax, m6
-    RET
-
 
 ;-----------------------------------------------------------------------------
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 10fa166..9b0788f 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -76,15 +76,6 @@ int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                       int order);
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                     int order);
-int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
-                                               const int16_t *v3,
-                                               int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
-                                             const int16_t *v3,
-                                             int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
-                                              const int16_t *v3,
-                                              int order, int mul);
 
 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
@@ -568,7 +559,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
     SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );
 
     c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
 #endif /* HAVE_MMXEXT_EXTERNAL */
 }
 
@@ -607,7 +597,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
 
 #if HAVE_SSE2_EXTERNAL
     c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
     if (cpu_flags & AV_CPU_FLAG_ATOM) {
         c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
     } else {
@@ -621,8 +610,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                                        int cpu_flags, unsigned high_bit_depth)
 {
 #if HAVE_SSSE3_EXTERNAL
-    if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
-        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
     c->bswap_buf = ff_bswap32_buf_ssse3;
 #endif /* HAVE_SSSE3_EXTERNAL */
 }

-- 
Libav/FFmpeg packaging