[SCM] libav/experimental: dsputil: Move APE-specific bits into apedsp
siretart at users.alioth.debian.org
siretart at users.alioth.debian.org
Sun Aug 10 16:03:20 UTC 2014
The following commit has been merged into the experimental branch:
commit 054013a0fc6f2b52c60cee3e051be8cc7f82cef3
Author: Diego Biurrun <diego at biurrun.de>
Date: Sun Dec 29 02:32:16 2013 +0100
dsputil: Move APE-specific bits into apedsp
diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index fb41918..6329295 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -25,6 +25,7 @@
#include "libavutil/avassert.h"
#include "libavutil/channel_layout.h"
#include "libavutil/opt.h"
+#include "apedsp.h"
#include "avcodec.h"
#include "dsputil.h"
#include "bytestream.h"
@@ -136,6 +137,7 @@ typedef struct APEContext {
AVClass *class; ///< class for AVOptions
AVCodecContext *avctx;
DSPContext dsp;
+ APEDSPContext adsp;
int channels;
int samples; ///< samples left to decode in current frame
int bps;
@@ -195,8 +197,6 @@ static void predictor_decode_stereo_3930(APEContext *ctx, int count);
static void predictor_decode_mono_3950(APEContext *ctx, int count);
static void predictor_decode_stereo_3950(APEContext *ctx, int count);
-// TODO: dsputilize
-
static av_cold int ape_decode_close(AVCodecContext *avctx)
{
APEContext *s = avctx->priv_data;
@@ -212,6 +212,19 @@ static av_cold int ape_decode_close(AVCodecContext *avctx)
return 0;
}
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
+ const int16_t *v3,
+ int order, int mul)
+{
+ int res = 0;
+
+ while (order--) {
+ res += *v1 * *v2++;
+ *v1++ += mul * *v3++;
+ }
+ return res;
+}
+
static av_cold int ape_decode_init(AVCodecContext *avctx)
{
APEContext *s = avctx->priv_data;
@@ -292,6 +305,15 @@ static av_cold int ape_decode_init(AVCodecContext *avctx)
s->predictor_decode_stereo = predictor_decode_stereo_3950;
}
+ s->adsp.scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
+
+ if (ARCH_ARM)
+ ff_apedsp_init_arm(&s->adsp);
+ if (ARCH_PPC)
+ ff_apedsp_init_ppc(&s->adsp);
+ if (ARCH_X86)
+ ff_apedsp_init_x86(&s->adsp);
+
ff_dsputil_init(&s->dsp, avctx);
avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
@@ -1263,9 +1285,10 @@ static void do_apply_filter(APEContext *ctx, int version, APEFilter *f,
while (count--) {
/* round fixedpoint scalar product */
- res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order,
- f->adaptcoeffs - order,
- order, APESIGN(*data));
+ res = ctx->adsp.scalarproduct_and_madd_int16(f->coeffs,
+ f->delay - order,
+ f->adaptcoeffs - order,
+ order, APESIGN(*data));
res = (res + (1 << (fracbits - 1))) >> fracbits;
res += *data;
*data++ = res;
diff --git a/libavformat/apetag.h b/libavcodec/apedsp.h
similarity index 54%
copy from libavformat/apetag.h
copy to libavcodec/apedsp.h
index 36e3211..64e2749 100644
--- a/libavformat/apetag.h
+++ b/libavcodec/apedsp.h
@@ -1,5 +1,5 @@
/*
- * APE tag handling
+ * Monkey's Audio lossless audio decoder
* Copyright (c) 2007 Benjamin Zores <ben at geexbox.org>
* based upon libdemac from Dave Chapman.
*
@@ -20,21 +20,25 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#ifndef AVFORMAT_APETAG_H
-#define AVFORMAT_APETAG_H
+#ifndef AVCODEC_APEDSP_H
+#define AVCODEC_APEDSP_H
-#include "avformat.h"
+#include <stdint.h>
-/**
- * Read and parse an APE tag
- *
- * @return offset of the tag start in the file
- */
-int64_t ff_ape_parse_tag(AVFormatContext *s);
+typedef struct APEDSPContext {
+ /**
+ * Calculate scalar product of v1 and v2,
+ * and v1[i] += v3[i] * mul
+ * @param len length of vectors, should be multiple of 16
+ */
+ int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
+ const int16_t *v2,
+ const int16_t *v3,
+ int len, int mul);
+} APEDSPContext;
-/**
- * Write an APE tag into a file.
- */
-int ff_ape_write_tag(AVFormatContext *s);
+void ff_apedsp_init_arm(APEDSPContext *c);
+void ff_apedsp_init_ppc(APEDSPContext *c);
+void ff_apedsp_init_x86(APEDSPContext *c);
-#endif /* AVFORMAT_APETAG_H */
+#endif /* AVCODEC_APEDSP_H */
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 9d5b6aa..13025af 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -24,6 +24,7 @@ OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o
OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
arm/sbrdsp_init_arm.o
+OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_init_arm.o
OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o
OBJS-$(CONFIG_FLAC_DECODER) += arm/flacdsp_init_arm.o \
arm/flacdsp_arm.o
@@ -97,6 +98,7 @@ NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o
NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
arm/sbrdsp_neon.o
+NEON-OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
diff --git a/libavcodec/arm/vorbisdsp_init_arm.c b/libavcodec/arm/apedsp_init_arm.c
similarity index 70%
copy from libavcodec/arm/vorbisdsp_init_arm.c
copy to libavcodec/arm/apedsp_init_arm.c
index 853ba2d..47ea034 100644
--- a/libavcodec/arm/vorbisdsp_init_arm.c
+++ b/libavcodec/arm/apedsp_init_arm.c
@@ -1,6 +1,5 @@
/*
- * ARM NEON optimised DSP functions
- * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ * Copyright (c) 2011 Mans Rullgard <mans at mansr.com>
*
* This file is part of Libav.
*
@@ -19,19 +18,21 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <stdint.h>
+
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
-#include "libavcodec/vorbisdsp.h"
+#include "libavcodec/apedsp.h"
-void ff_vorbis_inverse_coupling_neon(float *mag, float *ang,
- intptr_t blocksize);
+int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
+ const int16_t *v3, int len, int mul);
-av_cold void ff_vorbisdsp_init_arm(VorbisDSPContext *c)
+av_cold void ff_apedsp_init_arm(APEDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
- c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
}
}
diff --git a/libavcodec/arm/int_neon.S b/libavcodec/arm/apedsp_neon.S
similarity index 70%
copy from libavcodec/arm/int_neon.S
copy to libavcodec/arm/apedsp_neon.S
index 3d2faff..7cfbf43 100644
--- a/libavcodec/arm/int_neon.S
+++ b/libavcodec/arm/apedsp_neon.S
@@ -21,34 +21,6 @@
#include "libavutil/arm/asm.S"
-function ff_scalarproduct_int16_neon, export=1
- vmov.i16 q0, #0
- vmov.i16 q1, #0
- vmov.i16 q2, #0
- vmov.i16 q3, #0
-1: vld1.16 {d16-d17}, [r0]!
- vld1.16 {d20-d21}, [r1,:128]!
- vmlal.s16 q0, d16, d20
- vld1.16 {d18-d19}, [r0]!
- vmlal.s16 q1, d17, d21
- vld1.16 {d22-d23}, [r1,:128]!
- vmlal.s16 q2, d18, d22
- vmlal.s16 q3, d19, d23
- subs r2, r2, #16
- bne 1b
-
- vpadd.s32 d16, d0, d1
- vpadd.s32 d17, d2, d3
- vpadd.s32 d18, d4, d5
- vpadd.s32 d19, d6, d7
- vpadd.s32 d0, d16, d17
- vpadd.s32 d1, d18, d19
- vpadd.s32 d2, d0, d1
- vpaddl.s32 d3, d2
- vmov.32 r0, d3[0]
- bx lr
-endfunc
-
@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
function ff_scalarproduct_and_madd_int16_neon, export=1
vld1.16 {d28[],d29[]}, [sp]
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 16e052d..c9bdaa5 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -44,9 +44,6 @@ void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
-int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
- const int16_t *v3, int len, int mul);
-
av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
@@ -73,6 +70,4 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
c->vector_clip_int32 = ff_vector_clip_int32_neon;
c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
-
- c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
}
diff --git a/libavcodec/arm/int_neon.S b/libavcodec/arm/int_neon.S
index 3d2faff..42f3739 100644
--- a/libavcodec/arm/int_neon.S
+++ b/libavcodec/arm/int_neon.S
@@ -48,43 +48,3 @@ function ff_scalarproduct_int16_neon, export=1
vmov.32 r0, d3[0]
bx lr
endfunc
-
-@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
-function ff_scalarproduct_and_madd_int16_neon, export=1
- vld1.16 {d28[],d29[]}, [sp]
- vmov.i16 q0, #0
- vmov.i16 q1, #0
- vmov.i16 q2, #0
- vmov.i16 q3, #0
- mov r12, r0
-
-1: vld1.16 {d16-d17}, [r0,:128]!
- vld1.16 {d18-d19}, [r1]!
- vld1.16 {d20-d21}, [r2]!
- vld1.16 {d22-d23}, [r0,:128]!
- vld1.16 {d24-d25}, [r1]!
- vld1.16 {d26-d27}, [r2]!
- vmul.s16 q10, q10, q14
- vmul.s16 q13, q13, q14
- vmlal.s16 q0, d16, d18
- vmlal.s16 q1, d17, d19
- vadd.s16 q10, q8, q10
- vadd.s16 q13, q11, q13
- vmlal.s16 q2, d22, d24
- vmlal.s16 q3, d23, d25
- vst1.16 {q10}, [r12,:128]!
- subs r3, r3, #16
- vst1.16 {q13}, [r12,:128]!
- bne 1b
-
- vpadd.s32 d16, d0, d1
- vpadd.s32 d17, d2, d3
- vpadd.s32 d18, d4, d5
- vpadd.s32 d19, d6, d7
- vpadd.s32 d0, d16, d17
- vpadd.s32 d1, d18, d19
- vpadd.s32 d2, d0, d1
- vpaddl.s32 d3, d2
- vmov.32 r0, d3[0]
- bx lr
-endfunc
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 11447c0..6b84658 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2069,19 +2069,6 @@ static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
return res;
}
-static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
- const int16_t *v3,
- int order, int mul)
-{
- int res = 0;
-
- while (order--) {
- res += *v1 * *v2++;
- *v1++ += mul * *v3++;
- }
- return res;
-}
-
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
int32_t max, unsigned int len)
{
@@ -2294,8 +2281,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
c->try_8x8basis = try_8x8basis_c;
c->add_8x8basis = add_8x8basis_c;
- c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
-
c->scalarproduct_int16 = scalarproduct_int16_c;
c->vector_clip_int32 = vector_clip_int32_c;
c->vector_clipf = vector_clipf_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index d261f7e..471988b 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -255,16 +255,6 @@ typedef struct DSPContext {
*/
int32_t (*scalarproduct_int16)(const int16_t *v1,
const int16_t *v2 /* align 16 */, int len);
- /* ape functions */
- /**
- * Calculate scalar product of v1 and v2,
- * and v1[i] += v3[i] * mul
- * @param len length of vectors, should be multiple of 16
- */
- int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
- const int16_t *v2,
- const int16_t *v3,
- int len, int mul);
/**
* Clip each element in an array of int32_t to a given minimum and
diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile
index ec0674c..b78d4be 100644
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -12,6 +12,7 @@ OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o
OBJS-$(CONFIG_VIDEODSP) += ppc/videodsp_ppc.o
OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o
+OBJS-$(CONFIG_APE_DECODER) += ppc/apedsp_altivec.o
OBJS-$(CONFIG_SVQ1_ENCODER) += ppc/svq1enc_altivec.o
OBJS-$(CONFIG_VC1_DECODER) += ppc/vc1dsp_altivec.o
OBJS-$(CONFIG_VORBIS_DECODER) += ppc/vorbisdsp_altivec.o
diff --git a/libavcodec/ppc/int_altivec.c b/libavcodec/ppc/apedsp_altivec.c
similarity index 74%
copy from libavcodec/ppc/int_altivec.c
copy to libavcodec/ppc/apedsp_altivec.c
index fa3cb66..de9df45 100644
--- a/libavcodec/ppc/int_altivec.c
+++ b/libavcodec/ppc/apedsp_altivec.c
@@ -18,11 +18,6 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-/**
- * @file
- * miscellaneous integer operations
- */
-
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
@@ -30,32 +25,9 @@
#include "libavutil/attributes.h"
#include "libavutil/ppc/types_altivec.h"
-#include "libavutil/ppc/util_altivec.h"
-#include "libavcodec/dsputil.h"
-#include "dsputil_altivec.h"
-
-static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
- int order)
-{
- int i;
- LOAD_ZERO;
- register vec_s16 vec1;
- register vec_s32 res = vec_splat_s32(0), t;
- int32_t ires;
-
- for (i = 0; i < order; i += 8) {
- vec1 = vec_unaligned_load(v1);
- t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
- res = vec_sums(t, res);
- v1 += 8;
- v2 += 8;
- }
- res = vec_splat(res, 3);
- vec_ste(res, 0, &ires);
-
- return ires;
-}
+#include "libavcodec/apedsp.h"
+#if HAVE_ALTIVEC
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
const int16_t *v2,
const int16_t *v3,
@@ -95,10 +67,11 @@ static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
return ires;
}
+#endif /* HAVE_ALTIVEC */
-av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
+av_cold void ff_apedsp_init_ppc(APEDSPContext *c)
{
- c->scalarproduct_int16 = scalarproduct_int16_altivec;
-
+#if HAVE_ALTIVEC
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
+#endif /* HAVE_ALTIVEC */
}
diff --git a/libavcodec/ppc/int_altivec.c b/libavcodec/ppc/int_altivec.c
index fa3cb66..d76d34a 100644
--- a/libavcodec/ppc/int_altivec.c
+++ b/libavcodec/ppc/int_altivec.c
@@ -56,49 +56,7 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
return ires;
}
-static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
- const int16_t *v2,
- const int16_t *v3,
- int order, int mul)
-{
- LOAD_ZERO;
- vec_s16 *pv1 = (vec_s16 *) v1;
- register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
- register vec_s16 t0, t1, i0, i1, i4;
- register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
- register vec_s32 res = zero_s32v;
- register vec_u8 align = vec_lvsl(0, v2);
- int32_t ires;
-
- order >>= 4;
- do {
- i1 = vec_ld(16, v2);
- t0 = vec_perm(i2, i1, align);
- i2 = vec_ld(32, v2);
- t1 = vec_perm(i1, i2, align);
- i0 = pv1[0];
- i1 = pv1[1];
- res = vec_msum(t0, i0, res);
- res = vec_msum(t1, i1, res);
- i4 = vec_ld(16, v3);
- t0 = vec_perm(i3, i4, align);
- i3 = vec_ld(32, v3);
- t1 = vec_perm(i4, i3, align);
- pv1[0] = vec_mladd(t0, muls, i0);
- pv1[1] = vec_mladd(t1, muls, i1);
- pv1 += 2;
- v2 += 16;
- v3 += 16;
- } while (--order);
- res = vec_splat(vec_sums(res, zero_s32v), 3);
- vec_ste(res, 0, &ires);
-
- return ires;
-}
-
av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
{
c->scalarproduct_int16 = scalarproduct_int16_altivec;
-
- c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
}
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 8830a22..1024226 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -25,6 +25,7 @@ OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
+OBJS-$(CONFIG_APE_DECODER) += x86/apedsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
@@ -89,6 +90,7 @@ YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
+YASM-OBJS-$(CONFIG_APE_DECODER) += x86/apedsp.o
YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
diff --git a/libavcodec/x86/apedsp.asm b/libavcodec/x86/apedsp.asm
new file mode 100644
index 0000000..d721ebd
--- /dev/null
+++ b/libavcodec/x86/apedsp.asm
@@ -0,0 +1,167 @@
+;******************************************************************************
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+%macro SCALARPRODUCT 0
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+; int order, int mul)
+cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
+ shl orderq, 1
+ movd m7, mulm
+%if mmsize == 16
+ pshuflw m7, m7, 0
+ punpcklqdq m7, m7
+%else
+ pshufw m7, m7, 0
+%endif
+ pxor m6, m6
+ add v1q, orderq
+ add v2q, orderq
+ add v3q, orderq
+ neg orderq
+.loop:
+ movu m0, [v2q + orderq]
+ movu m1, [v2q + orderq + mmsize]
+ mova m4, [v1q + orderq]
+ mova m5, [v1q + orderq + mmsize]
+ movu m2, [v3q + orderq]
+ movu m3, [v3q + orderq + mmsize]
+ pmaddwd m0, m4
+ pmaddwd m1, m5
+ pmullw m2, m7
+ pmullw m3, m7
+ paddd m6, m0
+ paddd m6, m1
+ paddw m2, m4
+ paddw m3, m5
+ mova [v1q + orderq], m2
+ mova [v1q + orderq + mmsize], m3
+ add orderq, mmsize*2
+ jl .loop
+%if mmsize == 16
+ movhlps m0, m6
+ paddd m6, m0
+ pshuflw m0, m6, 0x4e
+%else
+ pshufw m0, m6, 0x4e
+%endif
+ paddd m6, m0
+ movd eax, m6
+ RET
+%endmacro
+
+INIT_MMX mmxext
+SCALARPRODUCT
+INIT_XMM sse2
+SCALARPRODUCT
+
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+ sub orderq, mmsize*2
+%if %1
+ mova m1, m4
+ mova m4, [v2q + orderq]
+ mova m0, [v2q + orderq + mmsize]
+ palignr m1, m0, %1
+ palignr m0, m4, %1
+ mova m3, m5
+ mova m5, [v3q + orderq]
+ mova m2, [v3q + orderq + mmsize]
+ palignr m3, m2, %1
+ palignr m2, m5, %1
+%else
+ mova m0, [v2q + orderq]
+ mova m1, [v2q + orderq + mmsize]
+ mova m2, [v3q + orderq]
+ mova m3, [v3q + orderq + mmsize]
+%endif
+ %define t0 [v1q + orderq]
+ %define t1 [v1q + orderq + mmsize]
+%if ARCH_X86_64
+ mova m8, t0
+ mova m9, t1
+ %define t0 m8
+ %define t1 m9
+%endif
+ pmaddwd m0, t0
+ pmaddwd m1, t1
+ pmullw m2, m7
+ pmullw m3, m7
+ paddw m2, t0
+ paddw m3, t1
+ paddd m6, m0
+ paddd m6, m1
+ mova [v1q + orderq], m2
+ mova [v1q + orderq + mmsize], m3
+ jg .loop%1
+%if %1
+ jmp .end
+%endif
+%endmacro
+
+; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
+; int order, int mul)
+INIT_XMM ssse3
+cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
+ shl orderq, 1
+ movd m7, mulm
+ pshuflw m7, m7, 0
+ punpcklqdq m7, m7
+ pxor m6, m6
+ mov r4d, v2d
+ and r4d, 15
+ and v2q, ~15
+ and v3q, ~15
+ mova m4, [v2q + orderq]
+ mova m5, [v3q + orderq]
+ ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+ cmp r4d, 0
+ je .loop0
+ cmp r4d, 2
+ je .loop2
+ cmp r4d, 4
+ je .loop4
+ cmp r4d, 6
+ je .loop6
+ cmp r4d, 8
+ je .loop8
+ cmp r4d, 10
+ je .loop10
+ cmp r4d, 12
+ je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+ movhlps m0, m6
+ paddd m6, m0
+ pshuflw m0, m6, 0x4e
+ paddd m6, m0
+ movd eax, m6
+ RET
diff --git a/libavcodec/x86/apedsp_init.c b/libavcodec/x86/apedsp_init.c
new file mode 100644
index 0000000..f692c2b
--- /dev/null
+++ b/libavcodec/x86/apedsp_init.c
@@ -0,0 +1,47 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/apedsp.h"
+
+int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
+ const int16_t *v3,
+ int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
+ const int16_t *v3,
+ int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
+ const int16_t *v3,
+ int order, int mul);
+
+av_cold void ff_apedsp_init_x86(APEDSPContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMXEXT(cpu_flags))
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
+
+ if (EXTERNAL_SSE2(cpu_flags))
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+
+ if (EXTERNAL_SSSE3(cpu_flags) &&
+ !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+}
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 684f09b..b5d6d3c 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -53,52 +53,6 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
paddd m2, m0
movd eax, m2
RET
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-; int order, int mul)
-cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
- shl orderq, 1
- movd m7, mulm
-%if mmsize == 16
- pshuflw m7, m7, 0
- punpcklqdq m7, m7
-%else
- pshufw m7, m7, 0
-%endif
- pxor m6, m6
- add v1q, orderq
- add v2q, orderq
- add v3q, orderq
- neg orderq
-.loop:
- movu m0, [v2q + orderq]
- movu m1, [v2q + orderq + mmsize]
- mova m4, [v1q + orderq]
- mova m5, [v1q + orderq + mmsize]
- movu m2, [v3q + orderq]
- movu m3, [v3q + orderq + mmsize]
- pmaddwd m0, m4
- pmaddwd m1, m5
- pmullw m2, m7
- pmullw m3, m7
- paddd m6, m0
- paddd m6, m1
- paddw m2, m4
- paddw m3, m5
- mova [v1q + orderq], m2
- mova [v1q + orderq + mmsize], m3
- add orderq, mmsize*2
- jl .loop
-%if mmsize == 16
- movhlps m0, m6
- paddd m6, m0
- pshuflw m0, m6, 0x4e
-%else
- pshufw m0, m6, 0x4e
-%endif
- paddd m6, m0
- movd eax, m6
- RET
%endmacro
INIT_MMX mmxext
@@ -106,97 +60,6 @@ SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
-%macro SCALARPRODUCT_LOOP 1
-align 16
-.loop%1:
- sub orderq, mmsize*2
-%if %1
- mova m1, m4
- mova m4, [v2q + orderq]
- mova m0, [v2q + orderq + mmsize]
- palignr m1, m0, %1
- palignr m0, m4, %1
- mova m3, m5
- mova m5, [v3q + orderq]
- mova m2, [v3q + orderq + mmsize]
- palignr m3, m2, %1
- palignr m2, m5, %1
-%else
- mova m0, [v2q + orderq]
- mova m1, [v2q + orderq + mmsize]
- mova m2, [v3q + orderq]
- mova m3, [v3q + orderq + mmsize]
-%endif
- %define t0 [v1q + orderq]
- %define t1 [v1q + orderq + mmsize]
-%if ARCH_X86_64
- mova m8, t0
- mova m9, t1
- %define t0 m8
- %define t1 m9
-%endif
- pmaddwd m0, t0
- pmaddwd m1, t1
- pmullw m2, m7
- pmullw m3, m7
- paddw m2, t0
- paddw m3, t1
- paddd m6, m0
- paddd m6, m1
- mova [v1q + orderq], m2
- mova [v1q + orderq + mmsize], m3
- jg .loop%1
-%if %1
- jmp .end
-%endif
-%endmacro
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-; int order, int mul)
-INIT_XMM ssse3
-cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
- shl orderq, 1
- movd m7, mulm
- pshuflw m7, m7, 0
- punpcklqdq m7, m7
- pxor m6, m6
- mov r4d, v2d
- and r4d, 15
- and v2q, ~15
- and v3q, ~15
- mova m4, [v2q + orderq]
- mova m5, [v3q + orderq]
- ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
- cmp r4d, 0
- je .loop0
- cmp r4d, 2
- je .loop2
- cmp r4d, 4
- je .loop4
- cmp r4d, 6
- je .loop6
- cmp r4d, 8
- je .loop8
- cmp r4d, 10
- je .loop10
- cmp r4d, 12
- je .loop12
-SCALARPRODUCT_LOOP 14
-SCALARPRODUCT_LOOP 12
-SCALARPRODUCT_LOOP 10
-SCALARPRODUCT_LOOP 8
-SCALARPRODUCT_LOOP 6
-SCALARPRODUCT_LOOP 4
-SCALARPRODUCT_LOOP 2
-SCALARPRODUCT_LOOP 0
-.end:
- movhlps m0, m6
- paddd m6, m0
- pshuflw m0, m6, 0x4e
- paddd m6, m0
- movd eax, m6
- RET
-
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 10fa166..9b0788f 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -76,15 +76,6 @@ int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
int order);
-int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
- const int16_t *v3,
- int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
- const int16_t *v3,
- int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
- const int16_t *v3,
- int order, int mul);
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
@@ -568,7 +559,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
- c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
#endif /* HAVE_MMXEXT_EXTERNAL */
}
@@ -607,7 +597,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
#if HAVE_SSE2_EXTERNAL
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
- c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
if (cpu_flags & AV_CPU_FLAG_ATOM) {
c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
} else {
@@ -621,8 +610,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
int cpu_flags, unsigned high_bit_depth)
{
#if HAVE_SSSE3_EXTERNAL
- if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
- c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}
--
Libav/FFmpeg packaging
More information about the pkg-multimedia-commits mailing list