[SCM] FFmpeg packaging branch, ubuntu.karmic, updated. debian/0.5+svn20090706-1ubuntu3-28-g19547ea
siretart at users.alioth.debian.org
siretart at users.alioth.debian.org
Tue Oct 13 21:39:33 UTC 2009
The following commit has been merged in the ubuntu.karmic branch:
commit 974080412977cee359a7c2c200a413c138db2a85
Author: Loïc Minier <lool at dooz.org>
Date: Sat Oct 10 13:13:33 2009 +0200
Add backported NEON patches from ffmpeg trunk
diff --git a/debian/patches/neon/0001-ARM-NEON-optimised-add_pixels_clamped.patch b/debian/patches/neon/0001-ARM-NEON-optimised-add_pixels_clamped.patch
new file mode 100644
index 0000000..18c8647
--- /dev/null
+++ b/debian/patches/neon/0001-ARM-NEON-optimised-add_pixels_clamped.patch
@@ -0,0 +1,94 @@
+From 75eadd829625f3ef75fad613846ff98773e547ca Mon Sep 17 00:00:00 2001
+From: mru <mru at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Sat, 4 Apr 2009 20:18:58 +0000
+Subject: [PATCH 01/27] ARM: NEON optimised add_pixels_clamped
+
+Based on patch by David Conrad.
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@18332 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/arm/dsputil_neon.c | 4 +++
+ libavcodec/arm/dsputil_neon_s.S | 45 +++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 49 insertions(+), 0 deletions(-)
+
+diff --git a/libavcodec/arm/dsputil_neon.c b/libavcodec/arm/dsputil_neon.c
+index e18a487..2e56308 100644
+--- a/libavcodec/arm/dsputil_neon.c
++++ b/libavcodec/arm/dsputil_neon.c
+@@ -41,6 +41,8 @@ void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+
+ void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int);
+
++void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
++
+ void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
+ void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int);
+ void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int);
+@@ -176,6 +178,8 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
+
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
+
++ c->add_pixels_clamped = ff_add_pixels_clamped_neon;
++
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
+ c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
+
+diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S
+index 3b39d2e..c305210 100644
+--- a/libavcodec/arm/dsputil_neon_s.S
++++ b/libavcodec/arm/dsputil_neon_s.S
+@@ -273,6 +273,51 @@ function ff_put_h264_qpel8_mc00_neon, export=1
+ pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
+ pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
+
++function ff_add_pixels_clamped_neon, export=1
++ mov r3, r1
++ vld1.64 {d16}, [r1,:64], r2
++ vld1.64 {d0-d1}, [r0,:128]!
++ vaddw.u8 q0, q0, d16
++ vld1.64 {d17}, [r1,:64], r2
++ vld1.64 {d2-d3}, [r0,:128]!
++ vqmovun.s16 d0, q0
++ vld1.64 {d18}, [r1,:64], r2
++ vaddw.u8 q1, q1, d17
++ vld1.64 {d4-d5}, [r0,:128]!
++ vaddw.u8 q2, q2, d18
++ vst1.64 {d0}, [r3,:64], r2
++ vqmovun.s16 d2, q1
++ vld1.64 {d19}, [r1,:64], r2
++ vld1.64 {d6-d7}, [r0,:128]!
++ vaddw.u8 q3, q3, d19
++ vqmovun.s16 d4, q2
++ vst1.64 {d2}, [r3,:64], r2
++ vld1.64 {d16}, [r1,:64], r2
++ vqmovun.s16 d6, q3
++ vld1.64 {d0-d1}, [r0,:128]!
++ vaddw.u8 q0, q0, d16
++ vst1.64 {d4}, [r3,:64], r2
++ vld1.64 {d17}, [r1,:64], r2
++ vld1.64 {d2-d3}, [r0,:128]!
++ vaddw.u8 q1, q1, d17
++ vst1.64 {d6}, [r3,:64], r2
++ vqmovun.s16 d0, q0
++ vld1.64 {d18}, [r1,:64], r2
++ vld1.64 {d4-d5}, [r0,:128]!
++ vaddw.u8 q2, q2, d18
++ vst1.64 {d0}, [r3,:64], r2
++ vqmovun.s16 d2, q1
++ vld1.64 {d19}, [r1,:64], r2
++ vqmovun.s16 d4, q2
++ vld1.64 {d6-d7}, [r0,:128]!
++ vaddw.u8 q3, q3, d19
++ vst1.64 {d2}, [r3,:64], r2
++ vqmovun.s16 d6, q3
++ vst1.64 {d4}, [r3,:64], r2
++ vst1.64 {d6}, [r3,:64], r2
++ bx lr
++ .endfunc
++
+ function ff_float_to_int16_neon, export=1
+ subs r2, r2, #8
+ vld1.64 {d0-d1}, [r1,:128]!
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0002-ARM-NEON-optimized-put_signed_pixels_clamped.patch b/debian/patches/neon/0002-ARM-NEON-optimized-put_signed_pixels_clamped.patch
new file mode 100644
index 0000000..b75d7f7
--- /dev/null
+++ b/debian/patches/neon/0002-ARM-NEON-optimized-put_signed_pixels_clamped.patch
@@ -0,0 +1,82 @@
+From 5bf2745fa3bdc996f2201c06eeff1d242d81cc2a Mon Sep 17 00:00:00 2001
+From: conrad <conrad at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Sat, 4 Apr 2009 21:02:48 +0000
+Subject: [PATCH 02/27] ARM: NEON optimized put_signed_pixels_clamped
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@18333 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/arm/dsputil_neon.c | 2 ++
+ libavcodec/arm/dsputil_neon_s.S | 37 +++++++++++++++++++++++++++++++++++++
+ 2 files changed, 39 insertions(+), 0 deletions(-)
+
+diff --git a/libavcodec/arm/dsputil_neon.c b/libavcodec/arm/dsputil_neon.c
+index 2e56308..37425a3 100644
+--- a/libavcodec/arm/dsputil_neon.c
++++ b/libavcodec/arm/dsputil_neon.c
+@@ -42,6 +42,7 @@ void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+ void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int);
+
+ void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
++void ff_put_signed_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
+
+ void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
+ void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int);
+@@ -179,6 +180,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
+
+ c->add_pixels_clamped = ff_add_pixels_clamped_neon;
++ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
+
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
+ c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
+diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S
+index c305210..5b95717 100644
+--- a/libavcodec/arm/dsputil_neon_s.S
++++ b/libavcodec/arm/dsputil_neon_s.S
+@@ -273,6 +273,43 @@ function ff_put_h264_qpel8_mc00_neon, export=1
+ pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
+ pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
+
++function ff_put_signed_pixels_clamped_neon, export=1
++ vmov.u8 d31, #128
++ vld1.64 {d16-d17}, [r0,:128]!
++ vqmovn.s16 d0, q8
++ vld1.64 {d18-d19}, [r0,:128]!
++ vqmovn.s16 d1, q9
++ vld1.64 {d16-d17}, [r0,:128]!
++ vqmovn.s16 d2, q8
++ vld1.64 {d18-d19}, [r0,:128]!
++ vadd.u8 d0, d0, d31
++ vld1.64 {d20-d21}, [r0,:128]!
++ vadd.u8 d1, d1, d31
++ vld1.64 {d22-d23}, [r0,:128]!
++ vadd.u8 d2, d2, d31
++ vst1.64 {d0}, [r1,:64], r2
++ vqmovn.s16 d3, q9
++ vst1.64 {d1}, [r1,:64], r2
++ vqmovn.s16 d4, q10
++ vst1.64 {d2}, [r1,:64], r2
++ vqmovn.s16 d5, q11
++ vld1.64 {d24-d25}, [r0,:128]!
++ vadd.u8 d3, d3, d31
++ vld1.64 {d26-d27}, [r0,:128]!
++ vadd.u8 d4, d4, d31
++ vadd.u8 d5, d5, d31
++ vst1.64 {d3}, [r1,:64], r2
++ vqmovn.s16 d6, q12
++ vst1.64 {d4}, [r1,:64], r2
++ vqmovn.s16 d7, q13
++ vst1.64 {d5}, [r1,:64], r2
++ vadd.u8 d6, d6, d31
++ vadd.u8 d7, d7, d31
++ vst1.64 {d6}, [r1,:64], r2
++ vst1.64 {d7}, [r1,:64], r2
++ bx lr
++ .endfunc
++
+ function ff_add_pixels_clamped_neon, export=1
+ mov r3, r1
+ vld1.64 {d16}, [r1,:64], r2
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0003-Add-guaranteed-alignment-for-loading-dest-pixels-in-.patch b/debian/patches/neon/0003-Add-guaranteed-alignment-for-loading-dest-pixels-in-.patch
new file mode 100644
index 0000000..f348326
--- /dev/null
+++ b/debian/patches/neon/0003-Add-guaranteed-alignment-for-loading-dest-pixels-in-.patch
@@ -0,0 +1,35 @@
+From ce7bbcc96ff94b2fdfbcff35f517f03512bc147b Mon Sep 17 00:00:00 2001
+From: conrad <conrad at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Thu, 16 Apr 2009 08:39:13 +0000
+Subject: [PATCH 03/27] Add guaranteed alignment for loading dest pixels in avg_pixels16_neon
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@18535 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/arm/dsputil_neon_s.S | 8 ++++----
+ 1 files changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S
+index 5b95717..f16293d 100644
+--- a/libavcodec/arm/dsputil_neon_s.S
++++ b/libavcodec/arm/dsputil_neon_s.S
+@@ -38,13 +38,13 @@
+ pld [r1, r2]
+ pld [r1, r2, lsl #1]
+ .if \avg
+- vld1.64 {d16,d17}, [ip], r2
++ vld1.64 {d16,d17}, [ip,:128], r2
+ vrhadd.u8 q0, q0, q8
+- vld1.64 {d18,d19}, [ip], r2
++ vld1.64 {d18,d19}, [ip,:128], r2
+ vrhadd.u8 q1, q1, q9
+- vld1.64 {d20,d21}, [ip], r2
++ vld1.64 {d20,d21}, [ip,:128], r2
+ vrhadd.u8 q2, q2, q10
+- vld1.64 {d22,d23}, [ip], r2
++ vld1.64 {d22,d23}, [ip,:128], r2
+ vrhadd.u8 q3, q3, q11
+ .endif
+ subs r3, r3, #4
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0004-Reorganise-intreadwrite.h.patch b/debian/patches/neon/0004-Reorganise-intreadwrite.h.patch
new file mode 100644
index 0000000..81ad80c
--- /dev/null
+++ b/debian/patches/neon/0004-Reorganise-intreadwrite.h.patch
@@ -0,0 +1,312 @@
+From f4bfca647b7228833d0e102f68d0726594c502b1 Mon Sep 17 00:00:00 2001
+From: mru <mru at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Sat, 18 Apr 2009 00:00:22 +0000
+Subject: [PATCH 04/27] Reorganise intreadwrite.h
+
+This changes intreadwrite.h to support per-arch implementations of the
+various macros allowing us to take advantage of special instructions
+or other properties the compiler does not know about.
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@18600 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavutil/intreadwrite.h | 213 ++++++++++++++++++++++++++++++----------------
+ 1 files changed, 141 insertions(+), 72 deletions(-)
+
+diff --git a/libavutil/intreadwrite.h b/libavutil/intreadwrite.h
+index 7c5909e..b1c5c2a 100644
+--- a/libavutil/intreadwrite.h
++++ b/libavutil/intreadwrite.h
+@@ -23,119 +23,88 @@
+ #include "config.h"
+ #include "bswap.h"
+
+-#ifdef __GNUC__
++/*
++ * Arch-specific headers can provide any combination of
++ * AV_[RW][BLN](16|32|64) macros. Preprocessor symbols must be
++ * defined, even if these are implemented as inline functions.
++ */
++
++
++/*
++ * Define AV_[RW]N helper macros to simplify definitions not provided
++ * by per-arch headers.
++ */
++
++#if defined(__GNUC__)
+
+ struct unaligned_64 { uint64_t l; } __attribute__((packed));
+ struct unaligned_32 { uint32_t l; } __attribute__((packed));
+ struct unaligned_16 { uint16_t l; } __attribute__((packed));
+
+-#define AV_RN16(a) (((const struct unaligned_16 *) (a))->l)
+-#define AV_RN32(a) (((const struct unaligned_32 *) (a))->l)
+-#define AV_RN64(a) (((const struct unaligned_64 *) (a))->l)
+-
+-#define AV_WN16(a, b) (((struct unaligned_16 *) (a))->l) = (b)
+-#define AV_WN32(a, b) (((struct unaligned_32 *) (a))->l) = (b)
+-#define AV_WN64(a, b) (((struct unaligned_64 *) (a))->l) = (b)
++# define AV_RN(s, p) (((const struct unaligned_##s *) (p))->l)
++# define AV_WN(s, p, v) (((struct unaligned_##s *) (p))->l) = (v)
+
+ #elif defined(__DECC)
+
+-#define AV_RN16(a) (*((const __unaligned uint16_t*)(a)))
+-#define AV_RN32(a) (*((const __unaligned uint32_t*)(a)))
+-#define AV_RN64(a) (*((const __unaligned uint64_t*)(a)))
+-
+-#define AV_WN16(a, b) *((__unaligned uint16_t*)(a)) = (b)
+-#define AV_WN32(a, b) *((__unaligned uint32_t*)(a)) = (b)
+-#define AV_WN64(a, b) *((__unaligned uint64_t*)(a)) = (b)
+-
+-#else
+-
+-#define AV_RN16(a) (*((const uint16_t*)(a)))
+-#define AV_RN32(a) (*((const uint32_t*)(a)))
+-#define AV_RN64(a) (*((const uint64_t*)(a)))
+-
+-#define AV_WN16(a, b) *((uint16_t*)(a)) = (b)
+-#define AV_WN32(a, b) *((uint32_t*)(a)) = (b)
+-#define AV_WN64(a, b) *((uint64_t*)(a)) = (b)
+-
+-#endif /* !__GNUC__ */
+-
+-/* endian macros */
+-#define AV_RB8(x) (((const uint8_t*)(x))[0])
+-#define AV_WB8(p, d) do { ((uint8_t*)(p))[0] = (d); } while(0)
+-
+-#define AV_RL8(x) AV_RB8(x)
+-#define AV_WL8(p, d) AV_WB8(p, d)
+-
+-#if HAVE_FAST_UNALIGNED
+-# ifdef WORDS_BIGENDIAN
+-# define AV_RB16(x) AV_RN16(x)
+-# define AV_WB16(p, d) AV_WN16(p, d)
+-
+-# define AV_RL16(x) bswap_16(AV_RN16(x))
+-# define AV_WL16(p, d) AV_WN16(p, bswap_16(d))
+-
+-# define AV_RB32(x) AV_RN32(x)
+-# define AV_WB32(p, d) AV_WN32(p, d)
++# define AV_RN(s, p) (*((const __unaligned uint##s##_t*)(p)))
++# define AV_WN(s, p, v) *((__unaligned uint##s##_t*)(p)) = (v)
+
+-# define AV_RL32(x) bswap_32(AV_RN32(x))
+-# define AV_WL32(p, d) AV_WN32(p, bswap_32(d))
++#elif HAVE_FAST_UNALIGNED
+
+-# define AV_RB64(x) AV_RN64(x)
+-# define AV_WB64(p, d) AV_WN64(p, d)
++# define AV_RN(s, p) (*((const uint##s##_t*)(p)))
++# define AV_WN(s, p, v) *((uint##s##_t*)(p)) = (v)
+
+-# define AV_RL64(x) bswap_64(AV_RN64(x))
+-# define AV_WL64(p, d) AV_WN64(p, bswap_64(d))
+-# else /* WORDS_BIGENDIAN */
+-# define AV_RB16(x) bswap_16(AV_RN16(x))
+-# define AV_WB16(p, d) AV_WN16(p, bswap_16(d))
+-
+-# define AV_RL16(x) AV_RN16(x)
+-# define AV_WL16(p, d) AV_WN16(p, d)
+-
+-# define AV_RB32(x) bswap_32(AV_RN32(x))
+-# define AV_WB32(p, d) AV_WN32(p, bswap_32(d))
+-
+-# define AV_RL32(x) AV_RN32(x)
+-# define AV_WL32(p, d) AV_WN32(p, d)
+-
+-# define AV_RB64(x) bswap_64(AV_RN64(x))
+-# define AV_WB64(p, d) AV_WN64(p, bswap_64(d))
++#else
+
+-# define AV_RL64(x) AV_RN64(x)
+-# define AV_WL64(p, d) AV_WN64(p, d)
+-# endif
+-#else /* HAVE_FAST_UNALIGNED */
+-#define AV_RB16(x) ((((const uint8_t*)(x))[0] << 8) | ((const uint8_t*)(x))[1])
++#ifndef AV_RB16
++#define AV_RB16(x) ((((const uint8_t*)(x))[0] << 8) | \
++ ((const uint8_t*)(x))[1])
++#endif
++#ifndef AV_WB16
+ #define AV_WB16(p, d) do { \
+ ((uint8_t*)(p))[1] = (d); \
+ ((uint8_t*)(p))[0] = (d)>>8; } while(0)
++#endif
+
++#ifndef AV_RL16
+ #define AV_RL16(x) ((((const uint8_t*)(x))[1] << 8) | \
+ ((const uint8_t*)(x))[0])
++#endif
++#ifndef AV_WL16
+ #define AV_WL16(p, d) do { \
+ ((uint8_t*)(p))[0] = (d); \
+ ((uint8_t*)(p))[1] = (d)>>8; } while(0)
++#endif
+
++#ifndef AV_RB32
+ #define AV_RB32(x) ((((const uint8_t*)(x))[0] << 24) | \
+ (((const uint8_t*)(x))[1] << 16) | \
+ (((const uint8_t*)(x))[2] << 8) | \
+ ((const uint8_t*)(x))[3])
++#endif
++#ifndef AV_WB32
+ #define AV_WB32(p, d) do { \
+ ((uint8_t*)(p))[3] = (d); \
+ ((uint8_t*)(p))[2] = (d)>>8; \
+ ((uint8_t*)(p))[1] = (d)>>16; \
+ ((uint8_t*)(p))[0] = (d)>>24; } while(0)
++#endif
+
++#ifndef AV_RL32
+ #define AV_RL32(x) ((((const uint8_t*)(x))[3] << 24) | \
+ (((const uint8_t*)(x))[2] << 16) | \
+ (((const uint8_t*)(x))[1] << 8) | \
+ ((const uint8_t*)(x))[0])
++#endif
++#ifndef AV_WL32
+ #define AV_WL32(p, d) do { \
+ ((uint8_t*)(p))[0] = (d); \
+ ((uint8_t*)(p))[1] = (d)>>8; \
+ ((uint8_t*)(p))[2] = (d)>>16; \
+ ((uint8_t*)(p))[3] = (d)>>24; } while(0)
++#endif
+
++#ifndef AV_RB64
+ #define AV_RB64(x) (((uint64_t)((const uint8_t*)(x))[0] << 56) | \
+ ((uint64_t)((const uint8_t*)(x))[1] << 48) | \
+ ((uint64_t)((const uint8_t*)(x))[2] << 40) | \
+@@ -144,6 +113,8 @@ struct unaligned_16 { uint16_t l; } __attribute__((packed));
+ ((uint64_t)((const uint8_t*)(x))[5] << 16) | \
+ ((uint64_t)((const uint8_t*)(x))[6] << 8) | \
+ (uint64_t)((const uint8_t*)(x))[7])
++#endif
++#ifndef AV_WB64
+ #define AV_WB64(p, d) do { \
+ ((uint8_t*)(p))[7] = (d); \
+ ((uint8_t*)(p))[6] = (d)>>8; \
+@@ -153,7 +124,9 @@ struct unaligned_16 { uint16_t l; } __attribute__((packed));
+ ((uint8_t*)(p))[2] = (d)>>40; \
+ ((uint8_t*)(p))[1] = (d)>>48; \
+ ((uint8_t*)(p))[0] = (d)>>56; } while(0)
++#endif
+
++#ifndef AV_RL64
+ #define AV_RL64(x) (((uint64_t)((const uint8_t*)(x))[7] << 56) | \
+ ((uint64_t)((const uint8_t*)(x))[6] << 48) | \
+ ((uint64_t)((const uint8_t*)(x))[5] << 40) | \
+@@ -162,6 +135,8 @@ struct unaligned_16 { uint16_t l; } __attribute__((packed));
+ ((uint64_t)((const uint8_t*)(x))[2] << 16) | \
+ ((uint64_t)((const uint8_t*)(x))[1] << 8) | \
+ (uint64_t)((const uint8_t*)(x))[0])
++#endif
++#ifndef AV_WL64
+ #define AV_WL64(p, d) do { \
+ ((uint8_t*)(p))[0] = (d); \
+ ((uint8_t*)(p))[1] = (d)>>8; \
+@@ -171,7 +146,101 @@ struct unaligned_16 { uint16_t l; } __attribute__((packed));
+ ((uint8_t*)(p))[5] = (d)>>40; \
+ ((uint8_t*)(p))[6] = (d)>>48; \
+ ((uint8_t*)(p))[7] = (d)>>56; } while(0)
+-#endif /* HAVE_FAST_UNALIGNED */
++#endif
++
++#ifdef WORDS_BIGENDIAN
++# define AV_RN(s, p) AV_RB##s(p)
++# define AV_WN(s, p, v) AV_WB##s(p, v)
++#else
++# define AV_RN(s, p) AV_RL##s(p)
++# define AV_WN(s, p, v) AV_WL##s(p, v)
++#endif
++
++#endif /* HAVE_FAST_UNALIGNED */
++
++#ifndef AV_RN16
++# define AV_RN16(p) AV_RN(16, p)
++#endif
++
++#ifndef AV_RN32
++# define AV_RN32(p) AV_RN(32, p)
++#endif
++
++#ifndef AV_RN64
++# define AV_RN64(p) AV_RN(64, p)
++#endif
++
++#ifndef AV_WN16
++# define AV_WN16(p, v) AV_WN(16, p, v)
++#endif
++
++#ifndef AV_WN32
++# define AV_WN32(p, v) AV_WN(32, p, v)
++#endif
++
++#ifndef AV_WN64
++# define AV_WN64(p, v) AV_WN(64, p, v)
++#endif
++
++#ifdef WORDS_BIGENDIAN
++# define AV_RB(s, p) AV_RN(s, p)
++# define AV_WB(s, p, v) AV_WN(s, p, v)
++# define AV_RL(s, p) bswap_##s(AV_RN(s, p))
++# define AV_WL(s, p, v) AV_WN(s, p, bswap_##s(v))
++#else
++# define AV_RB(s, p) bswap_##s(AV_RN(s, p))
++# define AV_WB(s, p, v) AV_WN(s, p, bswap_##s(v))
++# define AV_RL(s, p) AV_RN(s, p)
++# define AV_WL(s, p, v) AV_WN(s, p, v)
++#endif
++
++#define AV_RB8(x) (((const uint8_t*)(x))[0])
++#define AV_WB8(p, d) do { ((uint8_t*)(p))[0] = (d); } while(0)
++
++#define AV_RL8(x) AV_RB8(x)
++#define AV_WL8(p, d) AV_WB8(p, d)
++
++#ifndef AV_RB16
++# define AV_RB16(p) AV_RB(16, p)
++#endif
++#ifndef AV_WB16
++# define AV_WB16(p, v) AV_WB(16, p, v)
++#endif
++
++#ifndef AV_RL16
++# define AV_RL16(p) AV_RL(16, p)
++#endif
++#ifndef AV_WL16
++# define AV_WL16(p, v) AV_WL(16, p, v)
++#endif
++
++#ifndef AV_RB32
++# define AV_RB32(p) AV_RB(32, p)
++#endif
++#ifndef AV_WB32
++# define AV_WB32(p, v) AV_WB(32, p, v)
++#endif
++
++#ifndef AV_RL32
++# define AV_RL32(p) AV_RL(32, p)
++#endif
++#ifndef AV_WL32
++# define AV_WL32(p, v) AV_WL(32, p, v)
++#endif
++
++#ifndef AV_RB64
++# define AV_RB64(p) AV_RB(64, p)
++#endif
++#ifndef AV_WB64
++# define AV_WB64(p, v) AV_WB(64, p, v)
++#endif
++
++#ifndef AV_RL64
++# define AV_RL64(p) AV_RL(64, p)
++#endif
++#ifndef AV_WL64
++# define AV_WL64(p, v) AV_WL(64, p, v)
++#endif
+
+ #define AV_RB24(x) ((((const uint8_t*)(x))[0] << 16) | \
+ (((const uint8_t*)(x))[1] << 8) | \
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0005-ARM-asm-for-AV_RN.patch b/debian/patches/neon/0005-ARM-asm-for-AV_RN.patch
new file mode 100644
index 0000000..35f3895
--- /dev/null
+++ b/debian/patches/neon/0005-ARM-asm-for-AV_RN.patch
@@ -0,0 +1,124 @@
+From 8e2336d4b05585ed3a533b388751bff4f3cfef05 Mon Sep 17 00:00:00 2001
+From: mru <mru at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Sat, 18 Apr 2009 00:00:28 +0000
+Subject: [PATCH 05/27] ARM asm for AV_RN*()
+
+ARMv6 and later support unaligned loads and stores for single
+word/halfword but not double/multiple. GCC is ignorant of this and
+will always use bytewise accesses for unaligned data. Casting to an
+int32_t pointer is dangerous since a load/store double or multiple
+instruction might be used (this happens with some code in FFmpeg).
+Implementing the AV_[RW]* macros with inline asm using only supported
+instructions gives fast and safe unaligned accesses. ARM RVCT does
+the right thing with generic code.
+
+This gives an overall speedup of up to 10%.
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@18601 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavutil/arm/intreadwrite.h | 78 ++++++++++++++++++++++++++++++++++++++++++
+ libavutil/intreadwrite.h | 3 ++
+ 2 files changed, 81 insertions(+), 0 deletions(-)
+ create mode 100644 libavutil/arm/intreadwrite.h
+
+diff --git a/libavutil/arm/intreadwrite.h b/libavutil/arm/intreadwrite.h
+new file mode 100644
+index 0000000..de2e553
+--- /dev/null
++++ b/libavutil/arm/intreadwrite.h
+@@ -0,0 +1,78 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVUTIL_ARM_INTREADWRITE_H
++#define AVUTIL_ARM_INTREADWRITE_H
++
++#include <stdint.h>
++#include "config.h"
++
++#if HAVE_FAST_UNALIGNED && HAVE_INLINE_ASM
++
++#define AV_RN16 AV_RN16
++static inline uint16_t AV_RN16(const void *p)
++{
++ uint16_t v;
++ __asm__ ("ldrh %0, %1" : "=r"(v) : "m"(*(const uint16_t *)p));
++ return v;
++}
++
++#define AV_WN16 AV_WN16
++static inline void AV_WN16(void *p, uint16_t v)
++{
++ __asm__ ("strh %1, %0" : "=m"(*(uint16_t *)p) : "r"(v));
++}
++
++#define AV_RN32 AV_RN32
++static inline uint32_t AV_RN32(const void *p)
++{
++ uint32_t v;
++ __asm__ ("ldr %0, %1" : "=r"(v) : "m"(*(const uint32_t *)p));
++ return v;
++}
++
++#define AV_WN32 AV_WN32
++static inline void AV_WN32(void *p, uint32_t v)
++{
++ __asm__ ("str %1, %0" : "=m"(*(uint32_t *)p) : "r"(v));
++}
++
++#define AV_RN64 AV_RN64
++static inline uint64_t AV_RN64(const void *p)
++{
++ union { uint64_t v; uint32_t hl[2]; } v;
++ __asm__ ("ldr %0, %2 \n\t"
++ "ldr %1, %3 \n\t"
++ : "=r"(v.hl[0]), "=r"(v.hl[1])
++ : "m"(*(const uint32_t*)p), "m"(*((const uint32_t*)p+1)));
++ return v.v;
++}
++
++#define AV_WN64 AV_WN64
++static inline void AV_WN64(void *p, uint64_t v)
++{
++ union { uint64_t v; uint32_t hl[2]; } vv = { v };
++ __asm__ ("str %2, %0 \n\t"
++ "str %3, %1 \n\t"
++ : "=m"(*(uint32_t*)p), "=m"(*((uint32_t*)p+1))
++ : "r"(vv.hl[0]), "r"(vv.hl[1]));
++}
++
++#endif /* HAVE_FAST_UNALIGNED && HAVE_INLINE_ASM */
++
++#endif /* AVUTIL_ARM_INTREADWRITE_H */
+diff --git a/libavutil/intreadwrite.h b/libavutil/intreadwrite.h
+index b1c5c2a..42fb890 100644
+--- a/libavutil/intreadwrite.h
++++ b/libavutil/intreadwrite.h
+@@ -29,6 +29,9 @@
+ * defined, even if these are implemented as inline functions.
+ */
+
++#if ARCH_ARM
++# include "arm/intreadwrite.h"
++#endif
+
+ /*
+ * Define AV_[RW]N helper macros to simplify definitions not provided
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0006-ARM-NEON-put_pixels_clamped.patch b/debian/patches/neon/0006-ARM-NEON-put_pixels_clamped.patch
new file mode 100644
index 0000000..f7f6874
--- /dev/null
+++ b/debian/patches/neon/0006-ARM-NEON-put_pixels_clamped.patch
@@ -0,0 +1,69 @@
+From 9744978dd7b9ca9dda0846fa82faf6fa264a52a7 Mon Sep 17 00:00:00 2001
+From: conrad <conrad at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Wed, 29 Apr 2009 11:31:43 +0000
+Subject: [PATCH 06/27] ARM: NEON put_pixels_clamped
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@18712 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/arm/dsputil_neon.c | 2 ++
+ libavcodec/arm/dsputil_neon_s.S | 24 ++++++++++++++++++++++++
+ 2 files changed, 26 insertions(+), 0 deletions(-)
+
+diff --git a/libavcodec/arm/dsputil_neon.c b/libavcodec/arm/dsputil_neon.c
+index 37425a3..9b95130 100644
+--- a/libavcodec/arm/dsputil_neon.c
++++ b/libavcodec/arm/dsputil_neon.c
+@@ -42,6 +42,7 @@ void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+ void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int);
+
+ void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
++void ff_put_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
+ void ff_put_signed_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
+
+ void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
+@@ -180,6 +181,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
+
+ c->add_pixels_clamped = ff_add_pixels_clamped_neon;
++ c->put_pixels_clamped = ff_put_pixels_clamped_neon;
+ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
+
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
+diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S
+index f16293d..a55e05f 100644
+--- a/libavcodec/arm/dsputil_neon_s.S
++++ b/libavcodec/arm/dsputil_neon_s.S
+@@ -273,6 +273,30 @@ function ff_put_h264_qpel8_mc00_neon, export=1
+ pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
+ pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
+
++function ff_put_pixels_clamped_neon, export=1
++ vld1.64 {d16-d19}, [r0,:128]!
++ vqmovun.s16 d0, q8
++ vld1.64 {d20-d23}, [r0,:128]!
++ vqmovun.s16 d1, q9
++ vld1.64 {d24-d27}, [r0,:128]!
++ vqmovun.s16 d2, q10
++ vld1.64 {d28-d31}, [r0,:128]!
++ vqmovun.s16 d3, q11
++ vst1.64 {d0}, [r1,:64], r2
++ vqmovun.s16 d4, q12
++ vst1.64 {d1}, [r1,:64], r2
++ vqmovun.s16 d5, q13
++ vst1.64 {d2}, [r1,:64], r2
++ vqmovun.s16 d6, q14
++ vst1.64 {d3}, [r1,:64], r2
++ vqmovun.s16 d7, q15
++ vst1.64 {d4}, [r1,:64], r2
++ vst1.64 {d5}, [r1,:64], r2
++ vst1.64 {d6}, [r1,:64], r2
++ vst1.64 {d7}, [r1,:64], r2
++ bx lr
++ .endfunc
++
+ function ff_put_signed_pixels_clamped_neon, export=1
+ vmov.u8 d31, #128
+ vld1.64 {d16-d17}, [r0,:128]!
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0007-ARM-Use-fewer-register-in-NEON-put_pixels-_y2-and-_x.patch b/debian/patches/neon/0007-ARM-Use-fewer-register-in-NEON-put_pixels-_y2-and-_x.patch
new file mode 100644
index 0000000..3789b2d
--- /dev/null
+++ b/debian/patches/neon/0007-ARM-Use-fewer-register-in-NEON-put_pixels-_y2-and-_x.patch
@@ -0,0 +1,168 @@
+From dcd9d80a563a1877e1e5f9182d1bf0ac08ccc7d4 Mon Sep 17 00:00:00 2001
+From: conrad <conrad at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Wed, 29 Apr 2009 11:38:09 +0000
+Subject: [PATCH 07/27] ARM: Use fewer register in NEON put_pixels _y2 and _xy2
+ Approved by Mans on IRC
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@18713 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/arm/dsputil_neon_s.S | 64 ++++++++++++++++-----------------------
+ 1 files changed, 26 insertions(+), 38 deletions(-)
+
+diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S
+index a55e05f..303b11c 100644
+--- a/libavcodec/arm/dsputil_neon_s.S
++++ b/libavcodec/arm/dsputil_neon_s.S
+@@ -73,35 +73,29 @@
+ .endm
+
+ .macro pixels16_y2 vhadd=vrhadd.u8
+- push {lr}
+- add ip, r1, r2
+- lsl lr, r2, #1
+- vld1.64 {d0, d1}, [r1], lr
+- vld1.64 {d2, d3}, [ip], lr
++ vld1.64 {d0, d1}, [r1], r2
++ vld1.64 {d2, d3}, [r1], r2
+ 1: subs r3, r3, #2
+ \vhadd q2, q0, q1
+- vld1.64 {d0, d1}, [r1], lr
++ vld1.64 {d0, d1}, [r1], r2
+ \vhadd q3, q0, q1
+- vld1.64 {d2, d3}, [ip], lr
++ vld1.64 {d2, d3}, [r1], r2
+ pld [r1]
+- pld [ip]
++ pld [r1, r2]
+ vst1.64 {d4, d5}, [r0,:128], r2
+ vst1.64 {d6, d7}, [r0,:128], r2
+ bne 1b
+- pop {pc}
++ bx lr
+ .endm
+
+ .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
+- push {lr}
+- lsl lr, r2, #1
+- add ip, r1, r2
+- vld1.64 {d0-d2}, [r1], lr
+- vld1.64 {d4-d6}, [ip], lr
++ vld1.64 {d0-d2}, [r1], r2
++ vld1.64 {d4-d6}, [r1], r2
+ .if \no_rnd
+ vmov.i16 q13, #1
+ .endif
+ pld [r1]
+- pld [ip]
++ pld [r1, r2]
+ vext.8 q1, q0, q1, #1
+ vext.8 q3, q2, q3, #1
+ vaddl.u8 q8, d0, d2
+@@ -109,7 +103,7 @@
+ vaddl.u8 q9, d4, d6
+ vaddl.u8 q11, d5, d7
+ 1: subs r3, r3, #2
+- vld1.64 {d0-d2}, [r1], lr
++ vld1.64 {d0-d2}, [r1], r2
+ vadd.u16 q12, q8, q9
+ pld [r1]
+ .if \no_rnd
+@@ -123,11 +117,11 @@
+ .endif
+ \vshrn d29, q1, #2
+ vaddl.u8 q8, d0, d30
+- vld1.64 {d2-d4}, [ip], lr
++ vld1.64 {d2-d4}, [r1], r2
+ vaddl.u8 q10, d1, d31
+ vst1.64 {d28,d29}, [r0,:128], r2
+ vadd.u16 q12, q8, q9
+- pld [ip]
++ pld [r1, r2]
+ .if \no_rnd
+ vadd.u16 q12, q12, q13
+ .endif
+@@ -142,7 +136,7 @@
+ vaddl.u8 q11, d3, d5
+ vst1.64 {d30,d31}, [r0,:128], r2
+ bgt 1b
+- pop {pc}
++ bx lr
+ .endm
+
+ .macro pixels8
+@@ -180,41 +174,35 @@
+ .endm
+
+ .macro pixels8_y2 vhadd=vrhadd.u8
+- push {lr}
+- add ip, r1, r2
+- lsl lr, r2, #1
+- vld1.64 {d0}, [r1], lr
+- vld1.64 {d1}, [ip], lr
++ vld1.64 {d0}, [r1], r2
++ vld1.64 {d1}, [r1], r2
+ 1: subs r3, r3, #2
+ \vhadd d4, d0, d1
+- vld1.64 {d0}, [r1], lr
++ vld1.64 {d0}, [r1], r2
+ \vhadd d5, d0, d1
+- vld1.64 {d1}, [ip], lr
++ vld1.64 {d1}, [r1], r2
+ pld [r1]
+- pld [ip]
++ pld [r1, r2]
+ vst1.64 {d4}, [r0,:64], r2
+ vst1.64 {d5}, [r0,:64], r2
+ bne 1b
+- pop {pc}
++ bx lr
+ .endm
+
+ .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
+- push {lr}
+- lsl lr, r2, #1
+- add ip, r1, r2
+- vld1.64 {d0, d1}, [r1], lr
+- vld1.64 {d2, d3}, [ip], lr
++ vld1.64 {d0, d1}, [r1], r2
++ vld1.64 {d2, d3}, [r1], r2
+ .if \no_rnd
+ vmov.i16 q11, #1
+ .endif
+ pld [r1]
+- pld [ip]
++ pld [r1, r2]
+ vext.8 d4, d0, d1, #1
+ vext.8 d6, d2, d3, #1
+ vaddl.u8 q8, d0, d4
+ vaddl.u8 q9, d2, d6
+ 1: subs r3, r3, #2
+- vld1.64 {d0, d1}, [r1], lr
++ vld1.64 {d0, d1}, [r1], r2
+ pld [r1]
+ vadd.u16 q10, q8, q9
+ vext.8 d4, d0, d1, #1
+@@ -223,9 +211,9 @@
+ .endif
+ vaddl.u8 q8, d0, d4
+ \vshrn d5, q10, #2
+- vld1.64 {d2, d3}, [ip], lr
++ vld1.64 {d2, d3}, [r1], r2
+ vadd.u16 q10, q8, q9
+- pld [ip]
++ pld [r1, r2]
+ .if \no_rnd
+ vadd.u16 q10, q10, q11
+ .endif
+@@ -235,7 +223,7 @@
+ vaddl.u8 q9, d2, d6
+ vst1.64 {d7}, [r0,:64], r2
+ bgt 1b
+- pop {pc}
++ bx lr
+ .endm
+
+ .macro pixfunc pfx name suf rnd_op args:vararg
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0008-ARM-NEON-VP3-Loop-Filter.patch b/debian/patches/neon/0008-ARM-NEON-VP3-Loop-Filter.patch
new file mode 100644
index 0000000..fa92f76
--- /dev/null
+++ b/debian/patches/neon/0008-ARM-NEON-VP3-Loop-Filter.patch
@@ -0,0 +1,60 @@
+From 91dafadca2744957e6cc3ae13688c9033ec3c330 Mon Sep 17 00:00:00 2001
+From: conrad <conrad at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Sat, 23 May 2009 18:36:20 +0000
+Subject: [PATCH 08/27] ARM: NEON VP3 Loop Filter
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@18916 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/Makefile | 4 ++++
+ libavcodec/arm/dsputil_neon.c | 8 ++++++++
+ 2 files changed, 12 insertions(+), 0 deletions(-)
+
+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+index cf742d1..a213309 100644
+--- a/libavcodec/Makefile
++++ b/libavcodec/Makefile
+@@ -479,11 +479,15 @@ OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \
+ OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \
+ arm/mpegvideo_iwmmxt.o \
+
++NEON-OBJS-$(CONFIG_THEORA_DECODER) += arm/vp3dsp_neon.o
++NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o
++
+ OBJS-$(HAVE_NEON) += arm/dsputil_neon.o \
+ arm/dsputil_neon_s.o \
+ arm/h264dsp_neon.o \
+ arm/h264idct_neon.o \
+ arm/simple_idct_neon.o \
++ $(NEON-OBJS-yes)
+
+ OBJS-$(ARCH_BFIN) += bfin/dsputil_bfin.o \
+ bfin/fdct_bfin.o \
+diff --git a/libavcodec/arm/dsputil_neon.c b/libavcodec/arm/dsputil_neon.c
+index 9b95130..20425c1 100644
+--- a/libavcodec/arm/dsputil_neon.c
++++ b/libavcodec/arm/dsputil_neon.c
+@@ -150,6 +150,9 @@ void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
+ DCTELEM *block, int stride,
+ const uint8_t nnzc[6*8]);
+
++void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
++void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
++
+ void ff_vector_fmul_neon(float *dst, const float *src, int len);
+ void ff_vector_fmul_window_neon(float *dst, const float *src0,
+ const float *src1, const float *win,
+@@ -255,6 +258,11 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
+ c->h264_idct_add8 = ff_h264_idct_add8_neon;
+
++ if (CONFIG_VP3_DECODER || CONFIG_THEORA_DECODER) {
++ c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon;
++ c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon;
++ }
++
+ c->vector_fmul = ff_vector_fmul_neon;
+ c->vector_fmul_window = ff_vector_fmul_window_neon;
+
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0009-ARM-actually-add-VP3-loop-filter.patch b/debian/patches/neon/0009-ARM-actually-add-VP3-loop-filter.patch
new file mode 100644
index 0000000..71fbd84
--- /dev/null
+++ b/debian/patches/neon/0009-ARM-actually-add-VP3-loop-filter.patch
@@ -0,0 +1,114 @@
+From 2ca1dc1b533f6916ac593d435e1af8a299944d2c Mon Sep 17 00:00:00 2001
+From: conrad <conrad at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Sat, 23 May 2009 18:47:26 +0000
+Subject: [PATCH 09/27] ARM: actually add VP3 loop filter
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@18917 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/arm/vp3dsp_neon.S | 94 ++++++++++++++++++++++++++++++++++++++++++
+ 1 files changed, 94 insertions(+), 0 deletions(-)
+ create mode 100644 libavcodec/arm/vp3dsp_neon.S
+
+diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S
+new file mode 100644
+index 0000000..cce80dd
+--- /dev/null
++++ b/libavcodec/arm/vp3dsp_neon.S
+@@ -0,0 +1,94 @@
++/*
++ * Copyright (c) 2009 David Conrad
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "asm.S"
++
++.macro vp3_loop_filter
++ vsubl.u8 q3, d18, d17
++ vsubl.u8 q2, d16, d19
++ vadd.i16 q1, q3, q3 // with q3 = d18 - d17 and q2 = d16 - d19 from above: q1 = 2 * (d18 - d17)
++ vadd.i16 q2, q2, q3 // q2 = (d16 - d19) + (d18 - d17)
++ vadd.i16 q0, q1, q2 // q0 = (d16 - d19) + 3 * (d18 - d17)
++ vrshr.s16 q0, q0, #3 // rounding shift: q0 = (q0 + 4) >> 3
++ vmovl.u8 q9, d18 // widen d18 to 16 bits for the subtraction at the end
++ vdup.u16 q15, r2 // copy the limit held in r2 into every 16-bit lane
++
++ vabs.s16 q1, q0 // q1 = |q0|
++ vshr.s16 q0, q0, #15 // q0 = sign mask: -1 where q0 was negative, else 0
++ vqsub.u16 q2, q15, q1 // q2 = max(limit - |x|, 0)
++ vqsub.u16 q3, q2, q1 // q3 = max(q2 - |x|, 0)
++ vsub.i16 q1, q2, q3 // q1 = min(|x|, q2): the bounded magnitude
++ veor q1, q1, q0 // (q1 ^ mask) - mask puts the sign back
++ vsub.i16 q0, q1, q0 // q0 = signed, bounded correction
++
++ vaddw.u8 q2, q0, d17 // q2 = d17 + correction
++ vsub.i16 q3, q9, q0 // q3 = d18 - correction
++ vqmovun.s16 d0, q2 // saturate to 8 bits: d0 = filtered d17
++ vqmovun.s16 d1, q3 // d1 = filtered d18
++.endm
++
++function ff_vp3_v_loop_filter_neon, export=1 // C prototype: (uint8_t *, int, int *); used here as r0 = pixels, r1 = row stride, r2 = table
++ sub ip, r0, r1 // ip = row above r0: first of the two rows written back
++ sub r0, r0, r1, lsl #1 // start reading two rows above r0
++ vld1.64 {d16}, [r0,:64], r1 // d16..d19 = four consecutive rows of 8 pixels (64-bit aligned access)
++ vld1.64 {d17}, [r0,:64], r1
++ vld1.64 {d18}, [r0,:64], r1
++ vld1.64 {d19}, [r0,:64], r1
++ ldrb r2, [r2, #129*4] // one byte at byte offset 129*4 of the table; NOTE(review): presumably the low byte of 32-bit entry 129 on little-endian - confirm
++
++ vp3_loop_filter
++
++ vst1.64 {d0}, [ip,:64], r1 // d0 replaces the row d17 was loaded from
++ vst1.64 {d1}, [ip,:64], r1 // d1 replaces the row d18 was loaded from
++ bx lr
++.endfunc
++
++function ff_vp3_h_loop_filter_neon, export=1 // C prototype: (uint8_t *, int, int *); used here as r0 = pixels, r1 = row stride, r2 = table
++ sub ip, r0, #1 // ip = one pixel left of r0: start of the two columns written back
++ sub r0, r0, #2 // read 4 pixels per row starting two pixels left of r0
++ vld1.32 {d16[]}, [r0], r1 // rows 0-3: 4 bytes each, replicated to both 32-bit lanes
++ vld1.32 {d17[]}, [r0], r1
++ vld1.32 {d18[]}, [r0], r1
++ vld1.32 {d19[]}, [r0], r1
++ vld1.32 {d16[1]}, [r0], r1 // rows 4-7 overwrite the upper lane
++ vld1.32 {d17[1]}, [r0], r1
++ vld1.32 {d18[1]}, [r0], r1
++ vld1.32 {d19[1]}, [r0], r1
++ ldrb r2, [r2, #129*4] // one byte at byte offset 129*4 of the table; NOTE(review): presumably the low byte of 32-bit entry 129 on little-endian - confirm
++
++ vtrn.8 d16, d17 // transpose the 4x4 byte blocks so that
++ vtrn.8 d18, d19 // d16..d19 each hold one pixel column
++ vtrn.16 d16, d18 // (8 rows per register), in left-to-right order
++ vtrn.16 d17, d19
++
++ vp3_loop_filter
++
++ vtrn.8 d0, d1 // interleave the two filtered columns into per-row byte pairs
++
++ vst1.16 {d0[0]}, [ip], r1 // row 0: two pixels per store
++ vst1.16 {d1[0]}, [ip], r1 // row 1
++ vst1.16 {d0[1]}, [ip], r1 // row 2
++ vst1.16 {d1[1]}, [ip], r1 // row 3
++ vst1.16 {d0[2]}, [ip], r1 // row 4
++ vst1.16 {d1[2]}, [ip], r1 // row 5
++ vst1.16 {d0[3]}, [ip], r1 // row 6
++ vst1.16 {d1[3]}, [ip], r1
++ bx lr
++.endfunc
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0010-ARM-add-some-PLD-in-NEON-IDCT.patch b/debian/patches/neon/0010-ARM-add-some-PLD-in-NEON-IDCT.patch
new file mode 100644
index 0000000..41febea
--- /dev/null
+++ b/debian/patches/neon/0010-ARM-add-some-PLD-in-NEON-IDCT.patch
@@ -0,0 +1,55 @@
+From 8d8bd2247af043b8b61af8bbc4c131524182531a Mon Sep 17 00:00:00 2001
+From: mru <mru at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Thu, 28 May 2009 17:19:28 +0000
+Subject: [PATCH 10/27] ARM: add some PLD in NEON IDCT
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@18972 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/arm/simple_idct_neon.S | 17 +++++++++++++++--
+ 1 files changed, 15 insertions(+), 2 deletions(-)
+
+diff --git a/libavcodec/arm/simple_idct_neon.S b/libavcodec/arm/simple_idct_neon.S
+index e7099a2..0882481 100644
+--- a/libavcodec/arm/simple_idct_neon.S
++++ b/libavcodec/arm/simple_idct_neon.S
+@@ -68,6 +68,19 @@
+ .text
+ .align 6
+
++function idct_row4_pld_neon @ preload the 8 lines r0 + k*r1 (k = 0..7), then continue into idct_row4_neon
++ pld [r0] @ line 0
++ add r3, r0, r1, lsl #2 @ r3 = r0 + 4*r1
++ pld [r0, r1] @ line 1
++ pld [r0, r1, lsl #1] @ line 2
++ pld [r3, -r1] @ line 3
++ pld [r3] @ line 4
++ pld [r3, r1] @ line 5
++ add r3, r3, r1, lsl #1 @ r3 = r0 + 6*r1
++ pld [r3] @ line 6
++ pld [r3, r1] @ line 7
++ .endfunc @ no return instruction: execution falls through into idct_row4_neon, which follows
++
+ function idct_row4_neon
+ vmov.i32 q15, #(1<<(ROW_SHIFT-1))
+ vld1.64 {d2-d5}, [r2,:128]!
+@@ -252,7 +265,7 @@ idct_coeff_neon:
+ function ff_simple_idct_put_neon, export=1
+ idct_start r2
+
+- bl idct_row4_neon
++ bl idct_row4_pld_neon
+ bl idct_row4_neon
+ add r2, r2, #-128
+ bl idct_col4_neon
+@@ -307,7 +320,7 @@ function idct_col4_add8_neon
+ function ff_simple_idct_add_neon, export=1
+ idct_start r2
+
+- bl idct_row4_neon
++ bl idct_row4_pld_neon
+ bl idct_row4_neon
+ add r2, r2, #-128
+ bl idct_col4_neon
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0011-ARM-slightly-faster-NEON-H264-horizontal-loop-filter.patch b/debian/patches/neon/0011-ARM-slightly-faster-NEON-H264-horizontal-loop-filter.patch
new file mode 100644
index 0000000..2a9b328
--- /dev/null
+++ b/debian/patches/neon/0011-ARM-slightly-faster-NEON-H264-horizontal-loop-filter.patch
@@ -0,0 +1,85 @@
+From 56f78a98e66f1bd2bc29b00a048421e2f1760785 Mon Sep 17 00:00:00 2001
+From: mru <mru at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Wed, 17 Jun 2009 22:33:04 +0000
+Subject: [PATCH 11/27] ARM: slightly faster NEON H264 horizontal loop filter
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@19216 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/arm/h264dsp_neon.S | 49 +++++++++++++++++++++--------------------
+ 1 files changed, 25 insertions(+), 24 deletions(-)
+
+diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
+index 44a1373..03e21f1 100644
+--- a/libavcodec/arm/h264dsp_neon.S
++++ b/libavcodec/arm/h264dsp_neon.S
+@@ -37,6 +37,13 @@
+ vtrn.8 \r6, \r7
+ .endm
+
++ .macro transpose_4x4 r0 r1 r2 r3
++ vtrn.16 \r0, \r2 @ transposes the 4x4 byte block formed by each 32-bit lane of the four registers:
++ vtrn.16 \r1, \r3 @ first exchange the off-diagonal 16-bit halves between register pairs,
++ vtrn.8 \r0, \r1 @ then exchange the off-diagonal bytes inside each 2x2 sub-block
++ vtrn.8 \r2, \r3
++ .endm
++
+ .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
+ vswp \r0, \r4
+ vswp \r1, \r5
+@@ -469,35 +476,29 @@ function ff_h264_h_loop_filter_luma_neon, export=1
+ transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
+
+ align_push_regs
+- sub sp, sp, #16
+- vst1.64 {d4, d5}, [sp,:128]
+- sub sp, sp, #16
+- vst1.64 {d20,d21}, [sp,:128]
+
+ h264_loop_filter_luma
+
+- vld1.64 {d20,d21}, [sp,:128]!
+- vld1.64 {d4, d5}, [sp,:128]!
+-
+- transpose_8x8 q3, q10, q4, q8, q0, q5, q2, q13
++ transpose_4x4 q4, q8, q0, q5
+
+ sub r0, r0, r1, lsl #4
+- vst1.64 {d6}, [r0], r1
+- vst1.64 {d20}, [r0], r1
+- vst1.64 {d8}, [r0], r1
+- vst1.64 {d16}, [r0], r1
+- vst1.64 {d0}, [r0], r1
+- vst1.64 {d10}, [r0], r1
+- vst1.64 {d4}, [r0], r1
+- vst1.64 {d26}, [r0], r1
+- vst1.64 {d7}, [r0], r1
+- vst1.64 {d21}, [r0], r1
+- vst1.64 {d9}, [r0], r1
+- vst1.64 {d17}, [r0], r1
+- vst1.64 {d1}, [r0], r1
+- vst1.64 {d11}, [r0], r1
+- vst1.64 {d5}, [r0], r1
+- vst1.64 {d27}, [r0], r1
++ add r0, r0, #2
++ vst1.32 {d8[0]}, [r0], r1
++ vst1.32 {d16[0]}, [r0], r1
++ vst1.32 {d0[0]}, [r0], r1
++ vst1.32 {d10[0]}, [r0], r1
++ vst1.32 {d8[1]}, [r0], r1
++ vst1.32 {d16[1]}, [r0], r1
++ vst1.32 {d0[1]}, [r0], r1
++ vst1.32 {d10[1]}, [r0], r1
++ vst1.32 {d9[0]}, [r0], r1
++ vst1.32 {d17[0]}, [r0], r1
++ vst1.32 {d1[0]}, [r0], r1
++ vst1.32 {d11[0]}, [r0], r1
++ vst1.32 {d9[1]}, [r0], r1
++ vst1.32 {d17[1]}, [r0], r1
++ vst1.32 {d1[1]}, [r0], r1
++ vst1.32 {d11[1]}, [r0], r1
+
+ align_pop_regs
+ bx lr
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0012-ARM-enable-fast_unaligned-when-cpu-armv-67-is-specif.patch b/debian/patches/neon/0012-ARM-enable-fast_unaligned-when-cpu-armv-67-is-specif.patch
new file mode 100644
index 0000000..075ba45
--- /dev/null
+++ b/debian/patches/neon/0012-ARM-enable-fast_unaligned-when-cpu-armv-67-is-specif.patch
@@ -0,0 +1,28 @@
+From 8f934d1aa05a5db078dc732b37f2d46faa92e866 Mon Sep 17 00:00:00 2001
+From: mru <mru at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Tue, 30 Jun 2009 12:46:09 +0000
+Subject: [PATCH 12/27] ARM: enable fast_unaligned when --cpu=armv[67] is specified
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@19308 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ configure | 4 ++++
+ 1 files changed, 4 insertions(+), 0 deletions(-)
+
+diff --git a/configure b/configure
+index fd11501..7459215 100755
+--- a/configure
++++ b/configure
+@@ -1792,6 +1792,10 @@ if test $cpu != "generic"; then
+ add_cflags -mcpu=$cpu
+ enable fast_unaligned
+ ;;
++ armv[67]*)
++ add_cflags -march=$cpu
++ enable fast_unaligned
++ ;;
+ armv*)
+ add_cflags -march=$cpu
+ ;;
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0013-ARM-NEON-VP3-IDCT.patch b/debian/patches/neon/0013-ARM-NEON-VP3-IDCT.patch
new file mode 100644
index 0000000..452b35f
--- /dev/null
+++ b/debian/patches/neon/0013-ARM-NEON-VP3-IDCT.patch
@@ -0,0 +1,340 @@
+From 2056fee95d5b017aad91df30e68d1cec96ddfeb6 Mon Sep 17 00:00:00 2001
+From: conrad <conrad at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Sat, 4 Jul 2009 20:41:11 +0000
+Subject: [PATCH 13/27] ARM: NEON VP3 IDCT
+ 15% faster VP3/Theora, 10% faster VP6
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@19345 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/arm/dsputil_arm.c | 10 ++
+ libavcodec/arm/vp3dsp_neon.S | 282 ++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 292 insertions(+), 0 deletions(-)
+
+diff --git a/libavcodec/arm/dsputil_arm.c b/libavcodec/arm/dsputil_arm.c
+index c8a277e..c0ab0c9 100644
+--- a/libavcodec/arm/dsputil_arm.c
++++ b/libavcodec/arm/dsputil_arm.c
+@@ -43,6 +43,10 @@ void ff_simple_idct_neon(DCTELEM *data);
+ void ff_simple_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
+ void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
+
++void ff_vp3_idct_neon(DCTELEM *data);
++void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
++void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
++
+ /* XXX: local hack */
+ static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
+ static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
+@@ -180,6 +184,12 @@ void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx)
+ c->idct_add= ff_simple_idct_add_neon;
+ c->idct = ff_simple_idct_neon;
+ c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
++ } else if ((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER || CONFIG_THEORA_DECODER) &&
++ idct_algo==FF_IDCT_VP3){
++ c->idct_put= ff_vp3_idct_put_neon;
++ c->idct_add= ff_vp3_idct_add_neon;
++ c->idct = ff_vp3_idct_neon;
++ c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
+ #endif
+ }
+ }
+diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S
+index cce80dd..b2e16f4 100644
+--- a/libavcodec/arm/vp3dsp_neon.S
++++ b/libavcodec/arm/vp3dsp_neon.S
+@@ -20,6 +20,22 @@
+
+ #include "asm.S"
+
++.section .rodata
++.align 4
++
++vp3_idct_constants: // loaded into d0-d1 by vp3_idct_start_neon; the defines below name the lanes
++.short 64277, 60547, 54491, 46341, 36410, 25080, 12785 // round(65536 * cos(k*pi/16)), k = 1..7; the first five exceed 32767, so they act as (C - 65536) in signed multiplies
++
++#define xC1S7 d0[0]
++#define xC2S6 d0[1]
++#define xC3S5 d0[2]
++#define xC4S4 d0[3]
++#define xC5S3 d1[0]
++#define xC6S2 d1[1]
++#define xC7S1 d1[2]
++
++.text
++
+ .macro vp3_loop_filter
+ vsubl.u8 q3, d18, d17
+ vsubl.u8 q2, d16, d19
+@@ -92,3 +108,269 @@ function ff_vp3_h_loop_filter_neon, export=1
+ vst1.16 {d1[3]}, [ip], r1
+ bx lr
+ .endfunc
++
++
++function vp3_idct_start_neon // load constants and the 8x8 block at r2, then run the first pass by falling into the core
++ vpush {d8-d15} // saved here, restored by the vpop in each exported ff_vp3_idct*_neon
++ movrel r3, vp3_idct_constants
++ vld1.64 {d0-d1}, [r3,:128] // 16-byte load; only lanes d0[0]..d1[2] have names
++ vld1.64 {d16-d19}, [r2,:128]! // q8, q9 = ip[0], ip[1] (8 coefficients each)
++ vld1.64 {d20-d23}, [r2,:128]! // q10, q11 = ip[2], ip[3]
++ vld1.64 {d24-d27}, [r2,:128]! // q12, q13 = ip[4], ip[5]
++ vadd.s16 q1, q8, q12 // q1 = ip[0] + ip[4], as the core expects
++ vsub.s16 q8, q8, q12 // q8 = ip[0] - ip[4]
++ vld1.64 {d28-d31}, [r2,:128]! // q14, q15 = ip[6], ip[7]
++.endfunc // no return instruction: execution continues into vp3_idct_core_neon, whose bx lr returns to the caller
++
++function vp3_idct_core_neon // one 1-D pass over q8-q15; expects q1 = ip[0] + ip[4], q8 = ip[0] - ip[4], constants in d0-d1
++ vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16
++ vmull.s16 q3, d19, xC1S7
++ vmull.s16 q4, d2, xC4S4 // ((ip[0] + ip[4]) * C4) << 16
++ vmull.s16 q5, d3, xC4S4
++ vmull.s16 q6, d16, xC4S4 // ((ip[0] - ip[4]) * C4) << 16
++ vmull.s16 q7, d17, xC4S4
++ vshrn.s32 d4, q2, #16 // keep the high halves; constants above 32767 multiplied as (C - 65536), so the operand is added back below
++ vshrn.s32 d5, q3, #16
++ vshrn.s32 d6, q4, #16
++ vshrn.s32 d7, q5, #16
++ vshrn.s32 d8, q6, #16
++ vshrn.s32 d9, q7, #16
++ vadd.s16 q12, q1, q3 // E = (ip[0] + ip[4]) * C4
++ vadd.s16 q8, q8, q4 // F = (ip[0] - ip[4]) * C4
++ vadd.s16 q1, q2, q9 // ip[1] * C1
++
++ vmull.s16 q2, d30, xC1S7 // (ip[7] * C1) << 16
++ vmull.s16 q3, d31, xC1S7
++ vmull.s16 q4, d30, xC7S1 // (ip[7] * C7) << 16
++ vmull.s16 q5, d31, xC7S1
++ vmull.s16 q6, d18, xC7S1 // (ip[1] * C7) << 16
++ vmull.s16 q7, d19, xC7S1
++ vshrn.s32 d4, q2, #16
++ vshrn.s32 d5, q3, #16
++ vshrn.s32 d6, q4, #16 // ip[7] * C7
++ vshrn.s32 d7, q5, #16
++ vshrn.s32 d8, q6, #16 // ip[1] * C7
++ vshrn.s32 d9, q7, #16
++ vadd.s16 q2, q2, q15 // ip[7] * C1
++ vadd.s16 q9, q1, q3 // A = ip[1] * C1 + ip[7] * C7
++ vsub.s16 q15, q4, q2 // B = ip[1] * C7 - ip[7] * C1
++
++ vmull.s16 q2, d22, xC5S3 // (ip[3] * C5) << 16
++ vmull.s16 q3, d23, xC5S3
++ vmull.s16 q4, d22, xC3S5 // (ip[3] * C3) << 16
++ vmull.s16 q5, d23, xC3S5
++ vmull.s16 q6, d26, xC5S3 // (ip[5] * C5) << 16
++ vmull.s16 q7, d27, xC5S3
++ vshrn.s32 d4, q2, #16
++ vshrn.s32 d5, q3, #16
++ vshrn.s32 d6, q4, #16
++ vshrn.s32 d7, q5, #16
++ vshrn.s32 d8, q6, #16
++ vshrn.s32 d9, q7, #16
++ vadd.s16 q3, q3, q11 // ip[3] * C3
++ vadd.s16 q4, q4, q13 // ip[5] * C5
++ vadd.s16 q1, q2, q11 // ip[3] * C5
++ vadd.s16 q11, q3, q4 // C = ip[3] * C3 + ip[5] * C5
++
++ vmull.s16 q2, d26, xC3S5 // (ip[5] * C3) << 16
++ vmull.s16 q3, d27, xC3S5
++ vmull.s16 q4, d20, xC2S6 // (ip[2] * C2) << 16
++ vmull.s16 q5, d21, xC2S6
++ vmull.s16 q6, d28, xC6S2 // (ip[6] * C6) << 16
++ vmull.s16 q7, d29, xC6S2
++ vshrn.s32 d4, q2, #16
++ vshrn.s32 d5, q3, #16
++ vshrn.s32 d6, q4, #16
++ vshrn.s32 d7, q5, #16
++ vshrn.s32 d8, q6, #16 // ip[6] * C6
++ vshrn.s32 d9, q7, #16
++ vadd.s16 q2, q2, q13 // ip[5] * C3
++ vadd.s16 q3, q3, q10 // ip[2] * C2
++ vsub.s16 q13, q2, q1 // D = ip[5] * C3 - ip[3] * C5
++ vsub.s16 q1, q9, q11 // (A - C)
++ vadd.s16 q11, q9, q11 // Cd = A + C
++ vsub.s16 q9, q15, q13 // (B - D)
++ vadd.s16 q13, q15, q13 // Dd = B + D
++ vadd.s16 q15, q3, q4 // G = ip[2] * C2 + ip[6] * C6
++
++ vmull.s16 q2, d2, xC4S4 // ((A - C) * C4) << 16
++ vmull.s16 q3, d3, xC4S4
++ vmull.s16 q4, d28, xC2S6 // (ip[6] * C2) << 16
++ vmull.s16 q5, d29, xC2S6
++ vmull.s16 q6, d20, xC6S2 // (ip[2] * C6) << 16
++ vmull.s16 q7, d21, xC6S2
++ vshrn.s32 d4, q2, #16
++ vshrn.s32 d5, q3, #16
++ vshrn.s32 d6, q4, #16
++ vshrn.s32 d7, q5, #16
++ vshrn.s32 d8, q6, #16 // ip[2] * C6
++ vmull.s16 q5, d18, xC4S4 // ((B - D) * C4) << 16
++ vmull.s16 q6, d19, xC4S4
++ vshrn.s32 d9, q7, #16
++ vadd.s16 q3, q3, q14 // ip[6] * C2
++ vadd.s16 q10, q1, q2 // Ad = (A - C) * C4
++ vsub.s16 q14, q4, q3 // H = ip[2] * C6 - ip[6] * C2
++ bx lr // q5/q6 still hold the 32-bit (B - D) * C4 products; vp3_idct_end_*_neon narrows them
++.endfunc
++
++.macro VP3_IDCT_END type
++function vp3_idct_end_\type\()_neon
++.ifc \type, col
++ vdup.16 q0, r3 // column pass only: add the bias the caller put in r3 to E and F
++ vadd.s16 q12, q12, q0
++ vadd.s16 q8, q8, q0
++.endif
++
++ vshrn.s32 d2, q5, #16 // narrow the (B - D) * C4 products left in q5/q6 by the core
++ vshrn.s32 d3, q6, #16
++ vadd.s16 q2, q12, q15 // Gd = E + G
++ vadd.s16 q9, q1, q9 // (B - D) * C4
++ vsub.s16 q12, q12, q15 // Ed = E - G
++ vsub.s16 q3, q8, q10 // Fd = F - Ad
++ vadd.s16 q10, q8, q10 // Add = F + Ad
++ vadd.s16 q4, q9, q14 // Hd = Bd + H
++ vsub.s16 q14, q9, q14 // Bdd = Bd - H
++ vadd.s16 q8, q2, q11 // [0] = Gd + Cd
++ vsub.s16 q15, q2, q11 // [7] = Gd - Cd
++ vadd.s16 q9, q10, q4 // [1] = Add + Hd
++ vsub.s16 q10, q10, q4 // [2] = Add - Hd
++ vadd.s16 q11, q12, q13 // [3] = Ed + Dd
++ vsub.s16 q12, q12, q13 // [4] = Ed - Dd
++.ifc \type, row
++ vtrn.16 q8, q9 // first step of the transpose, issued early
++.endif
++ vadd.s16 q13, q3, q14 // [5] = Fd + Bdd
++ vsub.s16 q14, q3, q14 // [6] = Fd - Bdd
++
++.ifc \type, row
++ // 8x8 transpose
++ vtrn.16 q10, q11
++ vtrn.16 q12, q13
++ vtrn.16 q14, q15
++ vtrn.32 q8, q10
++ vtrn.32 q9, q11
++ vtrn.32 q12, q14
++ vtrn.32 q13, q15
++ vswp d17, d24
++ vswp d19, d26
++ vadd.s16 q1, q8, q12 // q1 = ip[0] + ip[4] for the next core pass (same setup as vp3_idct_start_neon)
++ vswp d21, d28
++ vsub.s16 q8, q8, q12 // q8 = ip[0] - ip[4] for the next core pass
++ vswp d23, d30
++.endif
++ bx lr
++.endfunc
++.endm
++
++VP3_IDCT_END row
++VP3_IDCT_END col
++
++function ff_vp3_idct_neon, export=1 // C prototype: (DCTELEM *data); transforms the 64 coefficients at r0 in place
++ mov ip, lr // the bl calls below overwrite lr
++ mov r2, r0 // helpers read the block through r2 with write-back; r0 is kept for the stores
++ bl vp3_idct_start_neon // load + first core pass (start falls through into the core)
++ bl vp3_idct_end_row_neon // finish the first pass and transpose
++ mov r3, #8 // bias added in the column pass: rounds the >> 4 below
++ bl vp3_idct_core_neon
++ bl vp3_idct_end_col_neon
++ mov lr, ip
++ vpop {d8-d15} // matches the vpush in vp3_idct_start_neon
++
++ vshr.s16 q8, q8, #4
++ vshr.s16 q9, q9, #4
++ vshr.s16 q10, q10, #4
++ vshr.s16 q11, q11, #4
++ vshr.s16 q12, q12, #4
++ vst1.64 {d16-d19}, [r0,:128]! // stores are interleaved with the remaining shifts
++ vshr.s16 q13, q13, #4
++ vshr.s16 q14, q14, #4
++ vst1.64 {d20-d23}, [r0,:128]!
++ vshr.s16 q15, q15, #4
++ vst1.64 {d24-d27}, [r0,:128]!
++ vst1.64 {d28-d31}, [r0,:128]!
++ bx lr
++.endfunc
++
++function ff_vp3_idct_put_neon, export=1 // C prototype: (uint8_t *dest, int line_size, DCTELEM *data) in r0, r1, r2
++ mov ip, lr // the bl calls below overwrite lr
++ bl vp3_idct_start_neon // load from r2 + first core pass
++ bl vp3_idct_end_row_neon
++ mov r3, #8 // rounding term for the >> 4 below
++ add r3, r3, #2048 // convert signed pixel to unsigned
++ bl vp3_idct_core_neon
++ bl vp3_idct_end_col_neon
++ mov lr, ip
++ vpop {d8-d15} // matches the vpush in vp3_idct_start_neon
++
++ vqshrun.s16 d0, q8, #4 // >> 4 and saturate each row to unsigned 8 bits
++ vqshrun.s16 d1, q9, #4
++ vqshrun.s16 d2, q10, #4
++ vqshrun.s16 d3, q11, #4
++ vst1.64 {d0}, [r0,:64], r1 // one 8-pixel row per store, r1 bytes apart
++ vqshrun.s16 d4, q12, #4
++ vst1.64 {d1}, [r0,:64], r1
++ vqshrun.s16 d5, q13, #4
++ vst1.64 {d2}, [r0,:64], r1
++ vqshrun.s16 d6, q14, #4
++ vst1.64 {d3}, [r0,:64], r1
++ vqshrun.s16 d7, q15, #4
++ vst1.64 {d4}, [r0,:64], r1
++ vst1.64 {d5}, [r0,:64], r1
++ vst1.64 {d6}, [r0,:64], r1
++ vst1.64 {d7}, [r0,:64], r1
++ bx lr
++.endfunc
++
++function ff_vp3_idct_add_neon, export=1 // C prototype: (uint8_t *dest, int line_size, DCTELEM *data) in r0, r1, r2
++ mov ip, lr // the bl calls below overwrite lr
++ bl vp3_idct_start_neon // load from r2 + first core pass
++ bl vp3_idct_end_row_neon
++ mov r3, #8 // rounding term for the >> 4 below
++ bl vp3_idct_core_neon
++ bl vp3_idct_end_col_neon
++ mov lr, ip
++ vpop {d8-d15} // matches the vpush in vp3_idct_start_neon
++ mov r2, r0 // r0 walks the rows being read, r2 the rows being written
++
++ vld1.64 {d0}, [r0,:64], r1 // loads of the existing pixels are interleaved with the arithmetic
++ vshr.s16 q8, q8, #4
++ vld1.64 {d1}, [r0,:64], r1
++ vshr.s16 q9, q9, #4
++ vld1.64 {d2}, [r0,:64], r1
++ vaddw.u8 q8, q8, d0 // residual + existing pixel, still 16 bits wide
++ vld1.64 {d3}, [r0,:64], r1
++ vaddw.u8 q9, q9, d1
++ vld1.64 {d4}, [r0,:64], r1
++ vshr.s16 q10, q10, #4
++ vld1.64 {d5}, [r0,:64], r1
++ vshr.s16 q11, q11, #4
++ vld1.64 {d6}, [r0,:64], r1
++ vqmovun.s16 d0, q8 // saturate the sum to unsigned 8 bits
++ vld1.64 {d7}, [r0,:64], r1
++ vqmovun.s16 d1, q9
++ vaddw.u8 q10, q10, d2
++ vaddw.u8 q11, q11, d3
++ vshr.s16 q12, q12, #4
++ vshr.s16 q13, q13, #4
++ vqmovun.s16 d2, q10
++ vqmovun.s16 d3, q11
++ vaddw.u8 q12, q12, d4
++ vaddw.u8 q13, q13, d5
++ vshr.s16 q14, q14, #4
++ vshr.s16 q15, q15, #4
++ vst1.64 {d0}, [r2,:64], r1
++ vqmovun.s16 d4, q12
++ vst1.64 {d1}, [r2,:64], r1
++ vqmovun.s16 d5, q13
++ vst1.64 {d2}, [r2,:64], r1
++ vaddw.u8 q14, q14, d6
++ vst1.64 {d3}, [r2,:64], r1
++ vaddw.u8 q15, q15, d7
++ vst1.64 {d4}, [r2,:64], r1
++ vqmovun.s16 d6, q14
++ vst1.64 {d5}, [r2,:64], r1
++ vqmovun.s16 d7, q15
++ vst1.64 {d6}, [r2,:64], r1
++ vst1.64 {d7}, [r2,:64], r1
++ bx lr
++.endfunc
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0014-ARM-NEON-optimised-vorbis_inverse_coupling.patch b/debian/patches/neon/0014-ARM-NEON-optimised-vorbis_inverse_coupling.patch
new file mode 100644
index 0000000..b8fcdbd
--- /dev/null
+++ b/debian/patches/neon/0014-ARM-NEON-optimised-vorbis_inverse_coupling.patch
@@ -0,0 +1,116 @@
+From d32e115cb8c43fe4531567b1c668dba6dc76274d Mon Sep 17 00:00:00 2001
+From: mru <mru at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Fri, 14 Aug 2009 01:02:06 +0000
+Subject: [PATCH 14/27] ARM: NEON optimised vorbis_inverse_coupling
+
+12% faster Vorbis decoding on Cortex-A8.
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@19637 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/arm/dsputil_neon.c | 5 +++
+ libavcodec/arm/dsputil_neon_s.S | 64 +++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 69 insertions(+), 0 deletions(-)
+
+diff --git a/libavcodec/arm/dsputil_neon.c b/libavcodec/arm/dsputil_neon.c
+index 20425c1..eb9aba1 100644
+--- a/libavcodec/arm/dsputil_neon.c
++++ b/libavcodec/arm/dsputil_neon.c
+@@ -161,6 +161,8 @@ void ff_vector_fmul_window_neon(float *dst, const float *src0,
+ void ff_float_to_int16_neon(int16_t *, const float *, long);
+ void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
+
++void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
++
+ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
+ {
+ c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
+@@ -270,4 +272,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
+ c->float_to_int16 = ff_float_to_int16_neon;
+ c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
+ }
++
++ if (CONFIG_VORBIS_DECODER)
++ c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
+ }
+diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S
+index 303b11c..2bc07fa 100644
+--- a/libavcodec/arm/dsputil_neon_s.S
++++ b/libavcodec/arm/dsputil_neon_s.S
+@@ -19,6 +19,7 @@
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
++#include "config.h"
+ #include "asm.S"
+
+ preserve8
+@@ -793,3 +794,66 @@ function ff_vector_fmul_window_neon, export=1
+ vst1.64 {d22,d23},[ip,:128], r5
+ pop {r4,r5,pc}
+ .endfunc
++
++#if CONFIG_VORBIS_DECODER
++function ff_vorbis_inverse_coupling_neon, export=1 @ C prototype: (float *mag, float *ang, int blocksize) in r0, r1, r2; in place, 4 floats per step
++ vmov.i32 q10, #1<<31 @ q10 = sign-bit mask
++ subs r2, r2, #4 @ Z is set when blocksize is exactly 4
++ @ no tst here: it replaced the flags of the subs above and sent every blocksize of 4 mod 8 to 3f after a single group
++ mov r3, r0 @ r3 = store pointer for mag (r0 runs ahead as load pointer)
++ mov r12, r1 @ r12 = store pointer for ang (r1 runs ahead as load pointer)
++ beq 3f @ only one group: skip the pipelined loop
++
++ vld1.32 {d24-d25},[r1,:128]! @ q12 = ang
++ vld1.32 {d22-d23},[r0,:128]! @ q11 = mag
++ vcle.s32 q8, q12, #0 @ q8 = all-ones where ang <= 0 (integer compare on the float bits)
++ vand q9, q11, q10 @ q9 = sign bit of mag
++ veor q12, q12, q9 @ negate ang where mag is negative
++ vand q2, q12, q8 @ q2 = that value where ang <= 0, else 0
++ vbic q3, q12, q8 @ q3 = that value where ang > 0, else 0
++ vadd.f32 q12, q11, q2 @ new mag = mag + q2
++ vsub.f32 q11, q11, q3 @ new ang = mag - q3
++1: vld1.32 {d2-d3}, [r1,:128]! @ same computation on q1/q0 while the previous results are stored
++ vld1.32 {d0-d1}, [r0,:128]!
++ vcle.s32 q8, q1, #0
++ vand q9, q0, q10
++ veor q1, q1, q9
++ vst1.32 {d24-d25},[r3, :128]!
++ vst1.32 {d22-d23},[r12,:128]!
++ vand q2, q1, q8
++ vbic q3, q1, q8
++ vadd.f32 q1, q0, q2
++ vsub.f32 q0, q0, q3
++ subs r2, r2, #8 @ two groups are consumed per loop iteration
++ ble 2f
++ vld1.32 {d24-d25},[r1,:128]!
++ vld1.32 {d22-d23},[r0,:128]!
++ vcle.s32 q8, q12, #0
++ vand q9, q11, q10
++ veor q12, q12, q9
++ vst1.32 {d2-d3}, [r3, :128]!
++ vst1.32 {d0-d1}, [r12,:128]!
++ vand q2, q12, q8
++ vbic q3, q12, q8
++ vadd.f32 q12, q11, q2
++ vsub.f32 q11, q11, q3
++ b 1b
++
++2: vst1.32 {d2-d3}, [r3, :128]! @ flush the results still held in q1/q0
++ vst1.32 {d0-d1}, [r12,:128]!
++ bxlt lr @ count went negative: done; on zero one more group remains
++
++3: vld1.32 {d2-d3}, [r1,:128] @ last (or only) group: no write-back, so r0/r1 also serve as store pointers
++ vld1.32 {d0-d1}, [r0,:128]
++ vcle.s32 q8, q1, #0
++ vand q9, q0, q10
++ veor q1, q1, q9
++ vand q2, q1, q8
++ vbic q3, q1, q8
++ vadd.f32 q1, q0, q2
++ vsub.f32 q0, q0, q3
++ vst1.32 {d2-d3}, [r0,:128]!
++ vst1.32 {d0-d1}, [r1,:128]!
++ bx lr
++ .endfunc
++#endif
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0015-ARM-handle-VFP-register-arguments-in-ff_vector_fmul_.patch b/debian/patches/neon/0015-ARM-handle-VFP-register-arguments-in-ff_vector_fmul_.patch
new file mode 100644
index 0000000..a6fca23
--- /dev/null
+++ b/debian/patches/neon/0015-ARM-handle-VFP-register-arguments-in-ff_vector_fmul_.patch
@@ -0,0 +1,31 @@
+From e965241fea3348a8205bd38f91efbfcd13e8cd31 Mon Sep 17 00:00:00 2001
+From: mru <mru at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Mon, 20 Jul 2009 22:30:27 +0000
+Subject: [PATCH 15/27] ARM: handle VFP register arguments in ff_vector_fmul_window_neon()
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@19475 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/arm/dsputil_neon_s.S | 6 ++++--
+ 1 files changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S
+index 2bc07fa..71d09c6 100644
+--- a/libavcodec/arm/dsputil_neon_s.S
++++ b/libavcodec/arm/dsputil_neon_s.S
+@@ -746,9 +746,11 @@ function ff_vector_fmul_neon, export=1
+ .endfunc
+
+ function ff_vector_fmul_window_neon, export=1
+- vld1.32 {d16[],d17[]}, [sp,:32]
++VFP vdup.32 q8, d0[0]
++NOVFP vld1.32 {d16[],d17[]}, [sp,:32]
+ push {r4,r5,lr}
+- ldr lr, [sp, #16]
++VFP ldr lr, [sp, #12]
++NOVFP ldr lr, [sp, #16]
+ sub r2, r2, #8
+ sub r5, lr, #2
+ add r2, r2, r5, lsl #2
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0016-ARM-NEON-optimised-FFT-and-MDCT.patch b/debian/patches/neon/0016-ARM-NEON-optimised-FFT-and-MDCT.patch
new file mode 100644
index 0000000..082a7c1
--- /dev/null
+++ b/debian/patches/neon/0016-ARM-NEON-optimised-FFT-and-MDCT.patch
@@ -0,0 +1,664 @@
+From 2d316e271f8837e0888b664ae45f6174f827a982 Mon Sep 17 00:00:00 2001
+From: =?utf-8?q?Lo=C3=AFc=20Minier?= <lool at dooz.org>
+Date: Sat, 10 Oct 2009 12:58:17 +0200
+Subject: [PATCH 16/27] ARM: NEON optimised FFT and MDCT
+
+Vorbis and AC3 ~3x faster.
+
+Parts by Naotoshi Nojiri, naonoj gmail
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@19806 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+
+Conflicts:
+
+ libavcodec/Makefile
+---
+ libavcodec/Makefile | 5 +
+ libavcodec/arm/fft_neon.S | 369 ++++++++++++++++++++++++++++++++++++++++++++
+ libavcodec/arm/mdct_neon.S | 178 +++++++++++++++++++++
+ libavcodec/dsputil.h | 4 +
+ libavcodec/fft.c | 10 +-
+ 5 files changed, 565 insertions(+), 1 deletions(-)
+ create mode 100644 libavcodec/arm/fft_neon.S
+ create mode 100644 libavcodec/arm/mdct_neon.S
+
+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+index a213309..02e0e8a 100644
+--- a/libavcodec/Makefile
++++ b/libavcodec/Makefile
+@@ -480,6 +480,11 @@ OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \
+ arm/mpegvideo_iwmmxt.o \
+
+ NEON-OBJS-$(CONFIG_THEORA_DECODER) += arm/vp3dsp_neon.o
++
++NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \
++
++NEON-OBJS-$(CONFIG_MDCT) += arm/mdct_neon.o \
++
+ NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o
+
+ OBJS-$(HAVE_NEON) += arm/dsputil_neon.o \
+diff --git a/libavcodec/arm/fft_neon.S b/libavcodec/arm/fft_neon.S
+new file mode 100644
+index 0000000..6ed5789
+--- /dev/null
++++ b/libavcodec/arm/fft_neon.S
+@@ -0,0 +1,369 @@
++/*
++ * ARM NEON optimised FFT
++ *
++ * Copyright (c) 2009 Mans Rullgard <mans at mansr.com>
++ * Copyright (c) 2009 Naotoshi Nojiri
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "asm.S"
++
++#define M_SQRT1_2 0.70710678118654752440
++
++ .text
++
++function fft4_neon @ transforms the 4 complex floats (32 bytes) at r0 in place
++ vld1.32 {d0-d3}, [r0,:128] @ d0..d3 = {r0,i0} {r1,i1} {r2,i2} {r3,i3}
++
++ vext.32 q8, q1, q1, #1 @ i2,r3 d3=i3,r2
++ vsub.f32 d6, d0, d1 @ r0-r1,i0-i1
++ vsub.f32 d7, d16, d17 @ r3-r2,i2-i3
++ vadd.f32 d4, d0, d1 @ r0+r1,i0+i1
++ vadd.f32 d5, d2, d3 @ i2+i3,r2+r3
++ vadd.f32 d1, d6, d7
++ vsub.f32 d3, d6, d7
++ vadd.f32 d0, d4, d5
++ vsub.f32 d2, d4, d5
++
++ vst1.32 {d0-d3}, [r0,:128] @ results overwrite the input
++
++ bx lr
++.endfunc
++
++function fft8_neon @ transforms the 8 complex floats (64 bytes) at r0 in place
++ mov r1, r0 @ r1 walks the input; r0 is kept for the final store
++ vld1.32 {d0-d3}, [r1,:128]!
++ vld1.32 {d16-d19}, [r1,:128]
++
++ movw r2, #0x04f3 @ sqrt(1/2)
++ movt r2, #0x3f35
++ eor r3, r2, #1<<31 @ r3 = -sqrt(1/2): same bits with the sign flipped
++ vdup.32 d31, r2 @ d31 = {w, w}
++
++ vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2
++ vadd.f32 d4, d16, d17 @ r4+r5,i4+i5
++ vmov d28, r3, r2 @ d28 = {-w, w}
++ vadd.f32 d5, d18, d19 @ r6+r7,i6+i7
++ vsub.f32 d17, d16, d17 @ r4-r5,i4-i5
++ vsub.f32 d19, d18, d19 @ r6-r7,i6-i7
++ vrev64.32 d29, d28 @ d29 = {w, -w}
++ vadd.f32 d20, d0, d1 @ r0+r1,i0+i1
++ vadd.f32 d21, d2, d3 @ r2+r3,i2+i3
++ vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w
++ vext.32 q3, q2, q2, #1
++ vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w
++ vsub.f32 d23, d22, d23 @ i2-i3,r3-r2
++ vsub.f32 d22, d0, d1 @ r0-r1,i0-i1
++ vmul.f32 d24, d17, d31 @ a2r*w,a2i*w
++ vmul.f32 d25, d19, d31 @ a3r*w,a3i*w
++ vadd.f32 d0, d20, d21
++ vsub.f32 d2, d20, d21
++ vadd.f32 d1, d22, d23
++ vrev64.32 q13, q13
++ vsub.f32 d3, d22, d23
++ vsub.f32 d6, d6, d7
++ vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2
++ vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6
++ vadd.f32 d7, d4, d5
++ vsub.f32 d18, d2, d6
++ vext.32 q13, q12, q12, #1
++ vadd.f32 d2, d2, d6
++ vsub.f32 d16, d0, d7
++ vadd.f32 d5, d25, d24
++ vsub.f32 d4, d26, d27
++ vadd.f32 d0, d0, d7
++ vsub.f32 d17, d1, d5
++ vsub.f32 d19, d3, d4
++ vadd.f32 d3, d3, d4
++ vadd.f32 d1, d1, d5
++
++ vst1.32 {d16-d19}, [r1,:128] @ second 32 bytes (r1 was advanced by the first load)
++ vst1.32 {d0-d3}, [r0,:128] @ first 32 bytes
++
++ bx lr
++.endfunc
++
++function fft16_neon @ transforms the 16 complex floats (128 bytes) at r0 in place
++ movrel r1, mppm @ mppm, pmmp and ff_cos_16 are tables defined outside this function
++ vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
++ pld [r0, #32]
++ vld1.32 {d2-d3}, [r1,:128]
++ vext.32 q13, q9, q9, #1
++ vld1.32 {d22-d25}, [r0,:128]! @ q11{r4,i4,r5,i5} q12{r6,i6,r7,i7}
++ vadd.f32 d4, d16, d17
++ vsub.f32 d5, d16, d17
++ vadd.f32 d18, d18, d19
++ vsub.f32 d19, d26, d27
++
++ vadd.f32 d20, d22, d23
++ vsub.f32 d22, d22, d23
++ vsub.f32 d23, d24, d25
++ vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1}
++ vadd.f32 d21, d24, d25
++ vmul.f32 d24, d22, d2
++ vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3}
++ vmul.f32 d25, d23, d3
++ vuzp.32 d16, d17 @ {r0,r1,i0,i1}
++ vmul.f32 q1, q11, d2[1]
++ vuzp.32 d18, d19 @ {r2,r3,i2,i3}
++ vrev64.32 q12, q12
++ vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6}
++ vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
++ vzip.32 q10, q11
++ vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
++ vadd.f32 d0, d22, d20
++ vadd.f32 d1, d21, d23
++ vsub.f32 d2, d21, d23
++ vsub.f32 d3, d22, d20
++ sub r0, r0, #96 @ undo the three post-incremented 32-byte loads: r0 = start of the block
++ vext.32 q13, q13, q13, #1
++ vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5}
++ vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
++ vext.32 q15, q15, q15, #1
++ vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7}
++ vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
++ vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3}
++ vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
++ vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6}
++ vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a}
++ movrel r2, ff_cos_16
++ vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8}
++ vrev64.32 d1, d1
++ vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a}
++ vrev64.32 d3, d3
++ movrel r3, pmmp
++ vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
++ vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
++ vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9}
++ vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13}
++ vld1.32 {d4-d5}, [r2,:64]
++ vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11}
++ vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15}
++ vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
++ vld1.32 {d6-d7}, [r3,:128]
++ vrev64.32 q1, q14
++ vmul.f32 q14, q14, d4[1]
++ vmul.f32 q1, q1, q3
++ vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a}
++ vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
++ vzip.32 q12, q14
++ vadd.f32 d0, d28, d24
++ vadd.f32 d1, d25, d29
++ vsub.f32 d2, d25, d29
++ vsub.f32 d3, d28, d24
++ vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9}
++ vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
++ vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13}
++ mov r1, #32 @ byte step between the interleaving stores below
++ vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5}
++ vrev64.32 q0, q13
++ vmul.f32 q13, q13, d5[0]
++ vrev64.32 q1, q15
++ vmul.f32 q15, q15, d5[1]
++ vst2.32 {d16-d17},[r0,:128], r1
++ vmul.f32 q0, q0, q3
++ vst2.32 {d20-d21},[r0,:128], r1
++ vmul.f32 q1, q1, q3
++ vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6}
++ vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a}
++ vst2.32 {d24-d25},[r0,:128], r1
++ vst2.32 {d28-d29},[r0,:128]
++ vzip.32 q13, q15
++ sub r0, r0, #80 @ three 32-byte steps were taken: r0 = start of the block + 16
++ vadd.f32 d0, d30, d26
++ vadd.f32 d1, d27, d31
++ vsub.f32 d2, d27, d31
++ vsub.f32 d3, d30, d26
++ vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11}
++ vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3}
++ vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15}
++ vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7}
++ vst2.32 {d18-d19},[r0,:128], r1
++ vst2.32 {d22-d23},[r0,:128], r1
++ vst2.32 {d26-d27},[r0,:128], r1
++ vst2.32 {d30-d31},[r0,:128]
++ bx lr
++.endfunc
++
++function fft_pass_neon
++ push {r4-r6,lr} @ in: r0 = z, r1 = wre, r2 = n
++ mov r6, r2 @ n
++ lsl r5, r2, #3 @ 2 * n * sizeof FFTSample
++ lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex
++ lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex
++ add r3, r2, r4 @ 6 * n * sizeof FFTComplex
++ add r4, r4, r0 @ &z[o1]
++ add r2, r2, r0 @ &z[o2]
++ add r3, r3, r0 @ &z[o3]
++ vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
++ movrel r12, pmmp
++ vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
++ add r5, r5, r1 @ wim
++ vld1.32 {d6-d7}, [r12,:128] @ pmmp
++ vswp d21, d22
++ vld1.32 {d4}, [r1,:64]! @ {wre[0],wre[1]}
++ sub r5, r5, #4 @ wim--
++ vrev64.32 q1, q11
++ vmul.f32 q11, q11, d4[1]
++ vmul.f32 q1, q1, q3
++ vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1]
++ vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
++ vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
++ sub r6, r6, #1 @ n--
++ vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
++ vzip.32 q10, q11
++ vadd.f32 d0, d22, d20
++ vadd.f32 d1, d21, d23
++ vsub.f32 d2, d21, d23
++ vsub.f32 d3, d22, d20
++ vsub.f32 q10, q8, q0
++ vadd.f32 q8, q8, q0
++ vsub.f32 q11, q9, q1
++ vadd.f32 q9, q9, q1
++ vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]}
++ vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]}
++ vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]}
++ vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]}
++ sub r5, r5, #8 @ wim -= 2
++1:
++ vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
++ vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
++ vswp d21, d22
++ vld1.32 {d4}, [r1]! @ {wre[0],wre[1]}
++ vrev64.32 q0, q10
++ vmul.f32 q10, q10, d4[0]
++ vrev64.32 q1, q11
++ vmul.f32 q11, q11, d4[1]
++ vld1.32 {d5}, [r5] @ {wim[-1],wim[0]}
++ vmul.f32 q0, q0, q3
++ sub r5, r5, #8 @ wim -= 2
++ vmul.f32 q1, q1, q3
++ vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6}
++ vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
++ vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
++ subs r6, r6, #1 @ n--
++ vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
++ vzip.32 q10, q11
++ vadd.f32 d0, d22, d20
++ vadd.f32 d1, d21, d23
++ vsub.f32 d2, d21, d23
++ vsub.f32 d3, d22, d20
++ vsub.f32 q10, q8, q0
++ vadd.f32 q8, q8, q0
++ vsub.f32 q11, q9, q1
++ vadd.f32 q9, q9, q1
++ vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]}
++ vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]}
++ vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]}
++ vst2.32 {d18-d19}, [r4,:128]! @ {z[o1],z[o1+1]}
++ bne 1b @ repeat until n reaches 0
++
++ pop {r4-r6,pc} @ restore r4-r6 and return
++.endfunc
++
++.macro def_fft n, n2, n4
++ .align 6
++function fft\n\()_neon
++ push {r4, lr}
++ mov r4, r0 @ keep z across the calls
++ bl fft\n2\()_neon @ size-n2 transform at z
++ add r0, r4, #\n4*2*8 @ z + 2*n4 elements of 8 bytes
++ bl fft\n4\()_neon @ size-n4 transform there
++ add r0, r4, #\n4*3*8 @ z + 3*n4 elements of 8 bytes
++ bl fft\n4\()_neon @ size-n4 transform there
++ mov r0, r4 @ r0 = z again
++ pop {r4, lr} @ restore lr before the tail call
++ movrel r1, ff_cos_\n
++ mov r2, #\n4/2 @ n argument of fft_pass_neon
++ b fft_pass_neon @ tail call, returns to our caller
++.endfunc
++.endm
++
++ def_fft 32, 16, 8
++ def_fft 64, 32, 16
++ def_fft 128, 64, 32
++ def_fft 256, 128, 64
++ def_fft 512, 256, 128
++ def_fft 1024, 512, 256
++ def_fft 2048, 1024, 512
++ def_fft 4096, 2048, 1024
++ def_fft 8192, 4096, 2048
++ def_fft 16384, 8192, 4096
++ def_fft 32768, 16384, 8192
++ def_fft 65536, 32768, 16384
++
++function ff_fft_calc_neon, export=1
++ ldr r2, [r0] @ nbits (r0 = s, r1 = z)
++ sub r2, r2, #2 @ fft_tab_neon starts at fft4_neon
++ movrel r3, fft_tab_neon
++ ldr r3, [r3, r2, lsl #2] @ r3 = fft_tab_neon[nbits - 2]
++ mov r0, r1 @ z becomes the first argument
++ bx r3 @ tail call the selected routine
++.endfunc
++
++function ff_fft_permute_neon, export=1
++ push {r4,lr} @ in: r0 = s, r1 = z
++ mov r12, #1
++ ldr r2, [r0] @ nbits
++ ldr r3, [r0, #20] @ tmp_buf
++ ldr r0, [r0, #8] @ revtab
++ lsl r12, r12, r2 @ n = 1 << nbits
++ mov r2, r12 @ second copy of n for the copy-back loop
++1:
++ vld1.32 {d0-d1}, [r1,:128]! @ load 16 bytes from z, advance
++ ldr r4, [r0], #4 @ load 32 bits of revtab, advance
++ uxtah lr, r3, r4 @ lr = tmp_buf + low halfword
++ uxtah r4, r3, r4, ror #16 @ r4 = tmp_buf + high halfword
++ vst1.32 {d0}, [lr,:64] @ scatter first 8 bytes
++ vst1.32 {d1}, [r4,:64] @ scatter second 8 bytes
++ subs r12, r12, #2 @ 16 bytes handled per iteration
++ bgt 1b
++
++ sub r1, r1, r2, lsl #3 @ rewind z by n * 8 bytes
++1:
++ vld1.32 {d0-d3}, [r3,:128]! @ read 32 bytes of tmp_buf
++ vst1.32 {d0-d3}, [r1,:128]! @ write them back to z
++ subs r2, r2, #4 @ 32 bytes handled per iteration
++ bgt 1b
++
++ pop {r4,pc}
++.endfunc
++
++ .section .rodata
++ .align 4
++fft_tab_neon:
++ .word fft4_neon
++ .word fft8_neon
++ .word fft16_neon
++ .word fft32_neon
++ .word fft64_neon
++ .word fft128_neon
++ .word fft256_neon
++ .word fft512_neon
++ .word fft1024_neon
++ .word fft2048_neon
++ .word fft4096_neon
++ .word fft8192_neon
++ .word fft16384_neon
++ .word fft32768_neon
++ .word fft65536_neon
++ .size fft_tab_neon, . - fft_tab_neon
++
++ .align 4
++pmmp: .float +1.0, -1.0, -1.0, +1.0
++mppm: .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
+diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S
+new file mode 100644
+index 0000000..6d1dcfd
+--- /dev/null
++++ b/libavcodec/arm/mdct_neon.S
+@@ -0,0 +1,178 @@
++/*
++ * ARM NEON optimised MDCT
++ * Copyright (c) 2009 Mans Rullgard <mans at mansr.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "asm.S"
++
++ .fpu neon
++ .text
++
++function ff_imdct_half_neon, export=1
++ push {r4-r8,lr}
++
++ mov r12, #1
++ ldr lr, [r0, #4] @ nbits
++ ldr r4, [r0, #8] @ tcos
++ ldr r5, [r0, #12] @ tsin
++ ldr r3, [r0, #24] @ revtab
++ lsl r12, r12, lr @ n = 1 << nbits
++ lsr lr, r12, #2 @ n4 = n >> 2
++ add r7, r2, r12, lsl #1
++ mov r12, #-16
++ sub r7, r7, #16
++
++ vld1.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
++ vld1.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
++ vld1.32 {d2}, [r4,:64]! @ d2=c0,c1
++ vld1.32 {d3}, [r5,:64]! @ d3=s0,s1
++ vuzp.32 d17, d16
++ vuzp.32 d0, d1
++ vmul.f32 d6, d16, d2
++ vmul.f32 d7, d0, d2
++1:
++ subs lr, lr, #2
++ ldr r6, [r3], #4
++ vmul.f32 d4, d0, d3
++ vmul.f32 d5, d16, d3
++ vsub.f32 d4, d6, d4
++ vadd.f32 d5, d5, d7
++ uxtah r8, r1, r6, ror #16
++ uxtah r6, r1, r6
++ beq 1f
++ vld1.32 {d16-d17},[r7,:128],r12
++ vld1.32 {d0-d1}, [r2,:128]!
++ vuzp.32 d17, d16
++ vld1.32 {d2}, [r4,:64]!
++ vuzp.32 d0, d1
++ vmul.f32 d6, d16, d2
++ vld1.32 {d3}, [r5,:64]!
++ vmul.f32 d7, d0, d2
++ vst2.32 {d4[0],d5[0]}, [r6,:64]
++ vst2.32 {d4[1],d5[1]}, [r8,:64]
++ b 1b
++1:
++ vst2.32 {d4[0],d5[0]}, [r6,:64]
++ vst2.32 {d4[1],d5[1]}, [r8,:64]
++
++ mov r4, r0
++ mov r6, r1
++ add r0, r0, #16
++ bl ff_fft_calc_neon
++
++ mov r12, #1
++ ldr lr, [r4, #4] @ nbits
++ ldr r5, [r4, #12] @ tsin
++ ldr r4, [r4, #8] @ tcos
++ lsl r12, r12, lr @ n = 1 << nbits
++ lsr lr, r12, #3 @ n8 = n >> 3
++
++ add r4, r4, lr, lsl #2
++ add r5, r5, lr, lsl #2
++ add r6, r6, lr, lsl #3
++ sub r1, r4, #8
++ sub r2, r5, #8
++ sub r3, r6, #16
++
++ mov r7, #-16
++ mov r12, #-8
++ mov r8, r6
++ mov r0, r3
++
++ vld1.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
++ vld1.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
++ vld1.32 {d18}, [r2,:64], r12 @ d18=s1,s0
++ vuzp.32 d20, d21
++ vuzp.32 d0, d1
++1:
++ subs lr, lr, #2
++ vmul.f32 d7, d0, d18
++ vld1.32 {d19}, [r5,:64]! @ d19=s2,s3
++ vmul.f32 d4, d1, d18
++ vld1.32 {d16}, [r1,:64], r12 @ d16=c1,c0
++ vmul.f32 d5, d21, d19
++ vld1.32 {d17}, [r4,:64]! @ d17=c2,c3
++ vmul.f32 d6, d20, d19
++ vmul.f32 d22, d1, d16
++ vmul.f32 d23, d21, d17
++ vmul.f32 d24, d0, d16
++ vmul.f32 d25, d20, d17
++ vadd.f32 d7, d7, d22
++ vadd.f32 d6, d6, d23
++ vsub.f32 d4, d4, d24
++ vsub.f32 d5, d5, d25
++ beq 1f
++ vld1.32 {d0-d1}, [r3,:128], r7
++ vld1.32 {d20-d21},[r6,:128]!
++ vld1.32 {d18}, [r2,:64], r12
++ vuzp.32 d20, d21
++ vuzp.32 d0, d1
++ vrev64.32 q3, q3
++ vtrn.32 d4, d6
++ vtrn.32 d5, d7
++ vswp d5, d6
++ vst1.32 {d4-d5}, [r0,:128], r7
++ vst1.32 {d6-d7}, [r8,:128]!
++ b 1b
++1:
++ vrev64.32 q3, q3
++ vtrn.32 d4, d6
++ vtrn.32 d5, d7
++ vswp d5, d6
++ vst1.32 {d4-d5}, [r0,:128]
++ vst1.32 {d6-d7}, [r8,:128]
++
++ pop {r4-r8,pc}
++.endfunc
++
++function ff_imdct_calc_neon, export=1
++ push {r4-r6,lr} @ in: r0 = s, r1 = output, r2 = input
++
++ ldr r3, [r0, #4] @ nbits
++ mov r4, #1
++ mov r5, r1 @ keep output pointer
++ lsl r4, r4, r3 @ n = 1 << nbits
++ add r1, r1, r4 @ half transform writes at output + n bytes
++
++ bl ff_imdct_half_neon
++
++ add r0, r5, r4, lsl #2 @ output + 4*n bytes
++ add r1, r5, r4, lsl #1 @ output + 2*n bytes
++ sub r0, r0, #8 @ r0 walks down from the last 8 bytes
++ sub r2, r1, #16 @ r2 walks down from just below r1
++ mov r3, #-16 @ post-decrement step for r2
++ mov r6, #-8 @ post-decrement step for r0
++ vmov.i32 d30, #1<<31 @ float sign-bit mask
++1:
++ vld1.32 {d0-d1}, [r2,:128], r3
++ pld [r0, #-16]
++ vrev64.32 q0, q0
++ vld1.32 {d2-d3}, [r1,:128]!
++ veor d4, d1, d30 @ flip sign bits
++ pld [r2, #-16]
++ vrev64.32 q1, q1
++ veor d5, d0, d30 @ flip sign bits
++ vst1.32 {d2}, [r0,:64], r6
++ vst1.32 {d3}, [r0,:64], r6
++ vst1.32 {d4-d5}, [r5,:128]!
++ subs r4, r4, #16
++ bgt 1b
++
++ pop {r4-r6,pc}
++.endfunc
+diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
+index e9c2bfc..c4abd14 100644
+--- a/libavcodec/dsputil.h
++++ b/libavcodec/dsputil.h
+@@ -692,11 +692,13 @@ extern FFTSample* ff_cos_tabs[13];
+ int ff_fft_init(FFTContext *s, int nbits, int inverse);
+ void ff_fft_permute_c(FFTContext *s, FFTComplex *z);
+ void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
++void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
+ void ff_fft_calc_c(FFTContext *s, FFTComplex *z);
+ void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
+ void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
+ void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
+ void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
++void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
+
+ /**
+ * Do the permutation needed BEFORE calling ff_fft_calc().
+@@ -766,6 +768,8 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *inpu
+ void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input);
+ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input);
+ void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input);
++void ff_imdct_calc_neon(MDCTContext *s, FFTSample *output, const FFTSample *input);
++void ff_imdct_half_neon(MDCTContext *s, FFTSample *output, const FFTSample *input);
+ void ff_mdct_calc(MDCTContext *s, FFTSample *out, const FFTSample *input);
+ void ff_mdct_end(MDCTContext *s);
+
+diff --git a/libavcodec/fft.c b/libavcodec/fft.c
+index 296f634..b05ddda 100644
+--- a/libavcodec/fft.c
++++ b/libavcodec/fft.c
+@@ -64,6 +64,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
+ float alpha, c1, s1, s2;
+ int split_radix = 1;
+ int av_unused has_vectors;
++ int revtab_shift = 0;
+
+ if (nbits < 2 || nbits > 16)
+ goto fail;
+@@ -112,6 +113,12 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
+ s->fft_calc = ff_fft_calc_altivec;
+ split_radix = 0;
+ }
++#elif HAVE_NEON
++ s->fft_permute = ff_fft_permute_neon;
++ s->fft_calc = ff_fft_calc_neon;
++ s->imdct_calc = ff_imdct_calc_neon;
++ s->imdct_half = ff_imdct_half_neon;
++ revtab_shift = 3;
+ #endif
+
+ if (split_radix) {
+@@ -125,7 +132,8 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
+ tab[m/2-i] = tab[i];
+ }
+ for(i=0; i<n; i++)
+- s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i;
++ s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] =
++ i << revtab_shift;
+ s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
+ } else {
+ int np, nblocks, np2, l;
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0017-ARM-faster-NEON-IMDCT.patch b/debian/patches/neon/0017-ARM-faster-NEON-IMDCT.patch
new file mode 100644
index 0000000..9720743
--- /dev/null
+++ b/debian/patches/neon/0017-ARM-faster-NEON-IMDCT.patch
@@ -0,0 +1,103 @@
+From d49bcbe5ba20f6130b2dcad078b0ee27b2660a8f Mon Sep 17 00:00:00 2001
+From: mru <mru at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Fri, 11 Sep 2009 02:01:18 +0000
+Subject: [PATCH 17/27] ARM: faster NEON IMDCT
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@19817 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/arm/mdct_neon.S | 46 ++++++++++++++++---------------------------
+ 1 files changed, 17 insertions(+), 29 deletions(-)
+
+diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S
+index 6d1dcfd..d84eccd 100644
+--- a/libavcodec/arm/mdct_neon.S
++++ b/libavcodec/arm/mdct_neon.S
+@@ -38,30 +38,28 @@ function ff_imdct_half_neon, export=1
+ mov r12, #-16
+ sub r7, r7, #16
+
+- vld1.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
+- vld1.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
++ vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
++ vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
++ vrev64.32 d17, d17
+ vld1.32 {d2}, [r4,:64]! @ d2=c0,c1
++ vmul.f32 d6, d17, d2
+ vld1.32 {d3}, [r5,:64]! @ d3=s0,s1
+- vuzp.32 d17, d16
+- vuzp.32 d0, d1
+- vmul.f32 d6, d16, d2
+ vmul.f32 d7, d0, d2
+ 1:
+ subs lr, lr, #2
+ ldr r6, [r3], #4
+ vmul.f32 d4, d0, d3
+- vmul.f32 d5, d16, d3
++ vmul.f32 d5, d17, d3
+ vsub.f32 d4, d6, d4
+ vadd.f32 d5, d5, d7
+ uxtah r8, r1, r6, ror #16
+ uxtah r6, r1, r6
+ beq 1f
+- vld1.32 {d16-d17},[r7,:128],r12
+- vld1.32 {d0-d1}, [r2,:128]!
+- vuzp.32 d17, d16
++ vld2.32 {d16-d17},[r7,:128],r12
++ vld2.32 {d0-d1}, [r2,:128]!
++ vrev64.32 d17, d17
+ vld1.32 {d2}, [r4,:64]!
+- vuzp.32 d0, d1
+- vmul.f32 d6, d16, d2
++ vmul.f32 d6, d17, d2
+ vld1.32 {d3}, [r5,:64]!
+ vmul.f32 d7, d0, d2
+ vst2.32 {d4[0],d5[0]}, [r6,:64]
+@@ -95,11 +93,9 @@ function ff_imdct_half_neon, export=1
+ mov r8, r6
+ mov r0, r3
+
+- vld1.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
+- vld1.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
++ vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
++ vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
+ vld1.32 {d18}, [r2,:64], r12 @ d18=s1,s0
+- vuzp.32 d20, d21
+- vuzp.32 d0, d1
+ 1:
+ subs lr, lr, #2
+ vmul.f32 d7, d0, d18
+@@ -118,25 +114,17 @@ function ff_imdct_half_neon, export=1
+ vsub.f32 d4, d4, d24
+ vsub.f32 d5, d5, d25
+ beq 1f
+- vld1.32 {d0-d1}, [r3,:128], r7
+- vld1.32 {d20-d21},[r6,:128]!
++ vld2.32 {d0-d1}, [r3,:128], r7
++ vld2.32 {d20-d21},[r6,:128]!
+ vld1.32 {d18}, [r2,:64], r12
+- vuzp.32 d20, d21
+- vuzp.32 d0, d1
+ vrev64.32 q3, q3
+- vtrn.32 d4, d6
+- vtrn.32 d5, d7
+- vswp d5, d6
+- vst1.32 {d4-d5}, [r0,:128], r7
+- vst1.32 {d6-d7}, [r8,:128]!
++ vst2.32 {d4,d6}, [r0,:128], r7
++ vst2.32 {d5,d7}, [r8,:128]!
+ b 1b
+ 1:
+ vrev64.32 q3, q3
+- vtrn.32 d4, d6
+- vtrn.32 d5, d7
+- vswp d5, d6
+- vst1.32 {d4-d5}, [r0,:128]
+- vst1.32 {d6-d7}, [r8,:128]
++ vst2.32 {d4,d6}, [r0,:128]
++ vst2.32 {d5,d7}, [r8,:128]
+
+ pop {r4-r8,pc}
+ .endfunc
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0018-Prepare-for-optimised-forward-MDCT-implementations.patch b/debian/patches/neon/0018-Prepare-for-optimised-forward-MDCT-implementations.patch
new file mode 100644
index 0000000..e5c265b
--- /dev/null
+++ b/debian/patches/neon/0018-Prepare-for-optimised-forward-MDCT-implementations.patch
@@ -0,0 +1,86 @@
+From 696f65e4e5d92a8777d922269558cc9f03ca1145 Mon Sep 17 00:00:00 2001
+From: mru <mru at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Fri, 11 Sep 2009 02:24:19 +0000
+Subject: [PATCH 18/27] Prepare for optimised forward MDCT implementations
+
+This adds a function pointer for forward MDCT to FFTContext and
+initialises it with the existing C function. ff_mdct_calc() is
+changed to an inline function calling the selected version as
+done for other fft/mdct functions.
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@19818 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/dsputil.h | 9 ++++++++-
+ libavcodec/fft.c | 1 +
+ libavcodec/mdct.c | 2 +-
+ 3 files changed, 10 insertions(+), 2 deletions(-)
+
+diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
+index c4abd14..e299ade 100644
+--- a/libavcodec/dsputil.h
++++ b/libavcodec/dsputil.h
+@@ -680,6 +680,7 @@ typedef struct FFTContext {
+ void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
+ void (*imdct_calc)(struct MDCTContext *s, FFTSample *output, const FFTSample *input);
+ void (*imdct_half)(struct MDCTContext *s, FFTSample *output, const FFTSample *input);
++ void (*mdct_calc)(struct MDCTContext *s, FFTSample *output, const FFTSample *input);
+ } FFTContext;
+
+ extern FFTSample* ff_cos_tabs[13];
+@@ -737,6 +738,12 @@ static inline void ff_imdct_half(MDCTContext *s, FFTSample *output, const FFTSam
+ s->fft.imdct_half(s, output, input);
+ }
+
++static inline void ff_mdct_calc(MDCTContext *s, FFTSample *output,
++ const FFTSample *input)
++{
++ s->fft.mdct_calc(s, output, input);
++}
++
+ /**
+ * Generate a Kaiser-Bessel Derived Window.
+ * @param window pointer to half window
+@@ -762,6 +769,7 @@ extern float *ff_sine_windows[6];
+ int ff_mdct_init(MDCTContext *s, int nbits, int inverse);
+ void ff_imdct_calc_c(MDCTContext *s, FFTSample *output, const FFTSample *input);
+ void ff_imdct_half_c(MDCTContext *s, FFTSample *output, const FFTSample *input);
++void ff_mdct_calc_c(MDCTContext *s, FFTSample *output, const FFTSample *input);
+ void ff_imdct_calc_3dn(MDCTContext *s, FFTSample *output, const FFTSample *input);
+ void ff_imdct_half_3dn(MDCTContext *s, FFTSample *output, const FFTSample *input);
+ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input);
+@@ -770,7 +778,6 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input
+ void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input);
+ void ff_imdct_calc_neon(MDCTContext *s, FFTSample *output, const FFTSample *input);
+ void ff_imdct_half_neon(MDCTContext *s, FFTSample *output, const FFTSample *input);
+-void ff_mdct_calc(MDCTContext *s, FFTSample *out, const FFTSample *input);
+ void ff_mdct_end(MDCTContext *s);
+
+ /* Real Discrete Fourier Transform */
+diff --git a/libavcodec/fft.c b/libavcodec/fft.c
+index b05ddda..655acd3 100644
+--- a/libavcodec/fft.c
++++ b/libavcodec/fft.c
+@@ -86,6 +86,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
+ s->fft_calc = ff_fft_calc_c;
+ s->imdct_calc = ff_imdct_calc_c;
+ s->imdct_half = ff_imdct_half_c;
++ s->mdct_calc = ff_mdct_calc_c;
+ s->exptab1 = NULL;
+
+ #if HAVE_MMX && HAVE_YASM
+diff --git a/libavcodec/mdct.c b/libavcodec/mdct.c
+index cb3388f..b0816b0 100644
+--- a/libavcodec/mdct.c
++++ b/libavcodec/mdct.c
+@@ -179,7 +179,7 @@ void ff_imdct_calc_c(MDCTContext *s, FFTSample *output, const FFTSample *input)
+ * @param input N samples
+ * @param out N/2 samples
+ */
+-void ff_mdct_calc(MDCTContext *s, FFTSample *out, const FFTSample *input)
++void ff_mdct_calc_c(MDCTContext *s, FFTSample *out, const FFTSample *input)
+ {
+ int i, j, n, n8, n4, n2, n3;
+ FFTSample re, im;
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0019-ARM-NEON-optimised-MDCT.patch b/debian/patches/neon/0019-ARM-NEON-optimised-MDCT.patch
new file mode 100644
index 0000000..2fc8925
--- /dev/null
+++ b/debian/patches/neon/0019-ARM-NEON-optimised-MDCT.patch
@@ -0,0 +1,207 @@
+From 2b3356ebb1f7b4b69d922e0bdfc9f5e631cf4793 Mon Sep 17 00:00:00 2001
+From: mru <mru at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Fri, 11 Sep 2009 02:24:22 +0000
+Subject: [PATCH 19/27] ARM: NEON optimised MDCT
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@19819 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/arm/mdct_neon.S | 160 ++++++++++++++++++++++++++++++++++++++++++++
+ libavcodec/dsputil.h | 1 +
+ libavcodec/fft.c | 1 +
+ 3 files changed, 162 insertions(+), 0 deletions(-)
+
+diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S
+index d84eccd..26ac199 100644
+--- a/libavcodec/arm/mdct_neon.S
++++ b/libavcodec/arm/mdct_neon.S
+@@ -164,3 +164,163 @@ function ff_imdct_calc_neon, export=1
+
+ pop {r4-r6,pc}
+ .endfunc
++
++function ff_mdct_calc_neon, export=1
++ push {r4-r10,lr} @ in: r0 = s, r1 = output, r2 = input
++
++ mov r12, #1
++ ldr lr, [r0, #4] @ nbits
++ ldr r4, [r0, #8] @ tcos
++ ldr r5, [r0, #12] @ tsin
++ ldr r3, [r0, #24] @ revtab
++ lsl lr, r12, lr @ n = 1 << nbits
++ add r7, r2, lr @ in4u
++ sub r9, r7, #16 @ in4d
++ add r2, r7, lr, lsl #1 @ in3u
++ add r8, r9, lr, lsl #1 @ in3d
++ mov r12, #-16
++
++ vld2.32 {d16,d18},[r9,:128],r12 @ x,x in4d1,in4d0
++ vld2.32 {d17,d19},[r8,:128],r12 @ x,x in3d1,in3d0
++ vld2.32 {d20,d21},[r7,:128]! @ in4u0,in4u1 x,x
++ vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
++ vld2.32 {d0, d1}, [r2,:128]! @ in3u0,in3u1 x,x
++ vsub.f32 d20, d18, d20 @ in4d-in4u I
++ vld1.32 {d2}, [r4,:64]! @ c0,c1
++ vadd.f32 d0, d0, d19 @ in3u+in3d -R
++ vld1.32 {d3}, [r5,:64]! @ s0,s1
++1:
++ vmul.f32 d7, d20, d3 @ I*s
++ vmul.f32 d6, d0, d2 @ -R*c
++ ldr r6, [r3], #4 @ next 32 bits of revtab
++ vmul.f32 d4, d0, d3 @ -R*s
++ vmul.f32 d5, d20, d2 @ I*c
++ subs lr, lr, #16
++ vsub.f32 d6, d6, d7 @ -R*c-I*s
++ vadd.f32 d7, d4, d5 @ -R*s+I*c
++ uxtah r10, r1, r6, ror #16 @ output + high halfword
++ uxtah r6, r1, r6 @ output + low halfword
++ beq 1f
++ vld2.32 {d16,d18},[r9,:128],r12 @ x,x in4d1,in4d0
++ vld2.32 {d17,d19},[r8,:128],r12 @ x,x in3d1,in3d0
++ vneg.f32 d7, d7 @ R*s-I*c
++ vld2.32 {d20,d21},[r7,:128]! @ in4u0,in4u1 x,x
++ vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
++ vld2.32 {d0, d1}, [r2,:128]! @ in3u0,in3u1 x,x
++ vsub.f32 d20, d18, d20 @ in4d-in4u I
++ vld1.32 {d2}, [r4,:64]! @ c0,c1
++ vadd.f32 d0, d0, d19 @ in3u+in3d -R
++ vld1.32 {d3}, [r5,:64]! @ s0,s1
++ vst2.32 {d6[0],d7[0]}, [r6,:64]
++ vst2.32 {d6[1],d7[1]}, [r10,:64]
++ b 1b
++1:
++ vneg.f32 d7, d7 @ R*s-I*c
++ vst2.32 {d6[0],d7[0]}, [r6,:64]
++ vst2.32 {d6[1],d7[1]}, [r10,:64]
++
++ mov r12, #1
++ ldr lr, [r0, #4] @ nbits
++ lsl lr, r12, lr @ n = 1 << nbits
++ sub r8, r2, #16 @ in1d
++ add r2, r9, #16 @ in0u
++ sub r9, r7, #16 @ in2d
++ mov r12, #-16
++
++ vld2.32 {d16,d18},[r9,:128],r12 @ x,x in2d1,in2d0
++ vld2.32 {d17,d19},[r8,:128],r12 @ x,x in1d1,in1d0
++ vld2.32 {d20,d21},[r7,:128]! @ in2u0,in2u1 x,x
++ vrev64.32 q9, q9 @ in2d0,in2d1 in1d0,in1d1
++ vld2.32 {d0, d1}, [r2,:128]! @ in0u0,in0u1 x,x
++ vsub.f32 d0, d0, d18 @ in0u-in2d R
++ vld1.32 {d2}, [r4,:64]! @ c0,c1
++ vadd.f32 d20, d20, d19 @ in2u+in1d -I
++ vld1.32 {d3}, [r5,:64]! @ s0,s1
++1:
++ vmul.f32 d6, d0, d2 @ R*c
++ vmul.f32 d7, d20, d3 @ -I*s
++ ldr r6, [r3], #4 @ next 32 bits of revtab
++ vmul.f32 d4, d0, d3 @ R*s
++ vmul.f32 d5, d20, d2 @ -I*c
++ subs lr, lr, #16
++ vsub.f32 d6, d7, d6 @ -I*s-R*c
++ vadd.f32 d7, d4, d5 @ R*s-I*c
++ uxtah r10, r1, r6, ror #16 @ output + high halfword
++ uxtah r6, r1, r6 @ output + low halfword
++ beq 1f
++ vld2.32 {d16,d18},[r9,:128],r12 @ x,x in2d1,in2d0
++ vld2.32 {d17,d19},[r8,:128],r12 @ x,x in1d1,in1d0
++ vld2.32 {d20,d21},[r7,:128]! @ in2u0,in2u1 x,x
++ vrev64.32 q9, q9 @ in2d0,in2d1 in1d0,in1d1
++ vld2.32 {d0, d1}, [r2,:128]! @ in0u0,in0u1 x,x
++ vsub.f32 d0, d0, d18 @ in0u-in2d R
++ vld1.32 {d2}, [r4,:64]! @ c0,c1
++ vadd.f32 d20, d20, d19 @ in2u+in1d -I
++ vld1.32 {d3}, [r5,:64]! @ s0,s1
++ vst2.32 {d6[0],d7[0]}, [r6,:64]
++ vst2.32 {d6[1],d7[1]}, [r10,:64]
++ b 1b
++1:
++ vst2.32 {d6[0],d7[0]}, [r6,:64]
++ vst2.32 {d6[1],d7[1]}, [r10,:64]
++
++ mov r4, r0 @ keep s across the call
++ mov r6, r1 @ keep output across the call
++ add r0, r0, #16 @ s + 16 is passed as the FFTContext
++ bl ff_fft_calc_neon
++
++ mov r12, #1
++ ldr lr, [r4, #4] @ nbits
++ ldr r5, [r4, #12] @ tsin
++ ldr r4, [r4, #8] @ tcos
++ lsl r12, r12, lr @ n = 1 << nbits
++ lsr lr, r12, #3 @ n8 = n >> 3
++
++ add r4, r4, lr, lsl #2
++ add r5, r5, lr, lsl #2
++ add r6, r6, lr, lsl #3
++ sub r1, r4, #8
++ sub r2, r5, #8
++ sub r3, r6, #16
++
++ mov r7, #-16
++ mov r12, #-8
++ mov r8, r6
++ mov r0, r3
++
++ vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
++ vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3
++ vld1.32 {d18}, [r2,:64], r12 @ d18=s1,s0
++1:
++ subs lr, lr, #2
++ vmul.f32 d7, d0, d18 @ r1*s1,r0*s0
++ vld1.32 {d19}, [r5,:64]! @ s2,s3
++ vmul.f32 d4, d1, d18 @ i1*s1,i0*s0
++ vld1.32 {d16}, [r1,:64], r12 @ c1,c0
++ vmul.f32 d5, d21, d19 @ i2*s2,i3*s3
++ vld1.32 {d17}, [r4,:64]! @ c2,c3
++ vmul.f32 d6, d20, d19 @ r2*s2,r3*s3
++ vmul.f32 d24, d0, d16 @ r1*c1,r0*c0
++ vmul.f32 d25, d20, d17 @ r2*c2,r3*c3
++ vmul.f32 d22, d21, d17 @ i2*c2,i3*c3
++ vmul.f32 d23, d1, d16 @ i1*c1,i0*c0
++ vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0
++ vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3
++ vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3
++ vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0
++ vneg.f32 q2, q2 @ negate d4 and d5
++ beq 1f
++ vld2.32 {d0-d1}, [r3,:128], r7
++ vld2.32 {d20-d21},[r6,:128]!
++ vld1.32 {d18}, [r2,:64], r12
++ vrev64.32 q3, q3
++ vst2.32 {d4,d6}, [r0,:128], r7
++ vst2.32 {d5,d7}, [r8,:128]!
++ b 1b
++1:
++ vrev64.32 q3, q3
++ vst2.32 {d4,d6}, [r0,:128]
++ vst2.32 {d5,d7}, [r8,:128]
++
++ pop {r4-r10,pc}
++.endfunc
+diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
+index e299ade..f89013c 100644
+--- a/libavcodec/dsputil.h
++++ b/libavcodec/dsputil.h
+@@ -778,6 +778,7 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input
+ void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input);
+ void ff_imdct_calc_neon(MDCTContext *s, FFTSample *output, const FFTSample *input);
+ void ff_imdct_half_neon(MDCTContext *s, FFTSample *output, const FFTSample *input);
++void ff_mdct_calc_neon(MDCTContext *s, FFTSample *output, const FFTSample *input);
+ void ff_mdct_end(MDCTContext *s);
+
+ /* Real Discrete Fourier Transform */
+diff --git a/libavcodec/fft.c b/libavcodec/fft.c
+index 655acd3..69feb44 100644
+--- a/libavcodec/fft.c
++++ b/libavcodec/fft.c
+@@ -119,6 +119,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
+ s->fft_calc = ff_fft_calc_neon;
+ s->imdct_calc = ff_imdct_calc_neon;
+ s->imdct_half = ff_imdct_half_neon;
++ s->mdct_calc = ff_mdct_calc_neon;
+ revtab_shift = 3;
+ #endif
+
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0020-ARM-remove-unnecessary-.fpu-neon-directives.patch b/debian/patches/neon/0020-ARM-remove-unnecessary-.fpu-neon-directives.patch
new file mode 100644
index 0000000..c86ea9a
--- /dev/null
+++ b/debian/patches/neon/0020-ARM-remove-unnecessary-.fpu-neon-directives.patch
@@ -0,0 +1,80 @@
+From cf79e9ce120293a3d8fd6887298283c27ee866d4 Mon Sep 17 00:00:00 2001
+From: mru <mru at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Fri, 2 Oct 2009 19:35:12 +0000
+Subject: [PATCH 20/20] ARM: remove unnecessary .fpu neon directives
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@20151 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ libavcodec/arm/dsputil_neon_s.S | 1 -
+ libavcodec/arm/h264dsp_neon.S | 2 --
+ libavcodec/arm/h264idct_neon.S | 2 --
+ libavcodec/arm/mdct_neon.S | 1 -
+ libavcodec/arm/simple_idct_neon.S | 2 --
+ 5 files changed, 0 insertions(+), 8 deletions(-)
+
+diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S
+index 71d09c6..f039349 100644
+--- a/libavcodec/arm/dsputil_neon_s.S
++++ b/libavcodec/arm/dsputil_neon_s.S
+@@ -23,7 +23,6 @@
+ #include "asm.S"
+
+ preserve8
+- .fpu neon
+ .text
+
+ .macro pixels16 avg=0
+diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
+index 03e21f1..78f312d 100644
+--- a/libavcodec/arm/h264dsp_neon.S
++++ b/libavcodec/arm/h264dsp_neon.S
+@@ -20,8 +20,6 @@
+
+ #include "asm.S"
+
+- .fpu neon
+-
+ .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
+ vtrn.32 \r0, \r4
+ vtrn.32 \r1, \r5
+diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
+index 6527390..d30e703 100644
+--- a/libavcodec/arm/h264idct_neon.S
++++ b/libavcodec/arm/h264idct_neon.S
+@@ -21,8 +21,6 @@
+ #include "asm.S"
+
+ preserve8
+- .fpu neon
+-
+ .text
+
+ function ff_h264_idct_add_neon, export=1
+diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S
+index 5cd4647..f089519 100644
+--- a/libavcodec/arm/mdct_neon.S
++++ b/libavcodec/arm/mdct_neon.S
+@@ -21,7 +21,6 @@
+
+ #include "asm.S"
+
+- .fpu neon
+ .text
+
+ function ff_imdct_half_neon, export=1
+diff --git a/libavcodec/arm/simple_idct_neon.S b/libavcodec/arm/simple_idct_neon.S
+index 0882481..4c329d8 100644
+--- a/libavcodec/arm/simple_idct_neon.S
++++ b/libavcodec/arm/simple_idct_neon.S
+@@ -45,8 +45,6 @@
+ #define w7 d1[2]
+ #define w4c d1[3]
+
+- .fpu neon
+-
+ .macro idct_col4_top
+ vmull.s16 q7, d6, w2 /* q9 = W2 * col[2] */
+ vmull.s16 q8, d6, w6 /* q10 = W6 * col[2] */
+--
+1.6.3.3
+
diff --git a/debian/patches/neon/0021-ARM-check-for-VFP-register-arguments.patch b/debian/patches/neon/0021-ARM-check-for-VFP-register-arguments.patch
new file mode 100644
index 0000000..7e47d32
--- /dev/null
+++ b/debian/patches/neon/0021-ARM-check-for-VFP-register-arguments.patch
@@ -0,0 +1,55 @@
+From 2062a3d147ea6123eb0aa2b010be6c662e58f291 Mon Sep 17 00:00:00 2001
+From: mru <mru at 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b>
+Date: Mon, 20 Jul 2009 22:30:24 +0000
+Subject: [PATCH] ARM: check for VFP register arguments
+
+git-svn-id: file:///var/local/repositories/ffmpeg/trunk@19474 9553f0bf-9b14-0410-a0b8-cfaf0461ba5b
+---
+ configure | 6 ++++++
+ libavcodec/arm/asm.S | 9 +++++++++
+ 2 files changed, 15 insertions(+), 0 deletions(-)
+
+diff --git a/configure b/configure
+index 7459215..d0c7096 100755
+--- a/configure
++++ b/configure
+@@ -917,6 +917,7 @@ HAVE_LIST="
+ termios_h
+ threads
+ truncf
++ vfp_args
+ winsock2_h
+ yasm
+ "
+@@ -1866,6 +1867,11 @@ fi
+
+ # check for assembler specific support
+
++enabled arm && check_ld <<EOF && enable vfp_args
++__asm__ (".eabi_attribute 28, 1");
++int main(void) { return 0; }
++EOF
++
+ enabled ppc && check_asm dcbzl '"dcbzl 0, 1"'
+ enabled ppc && check_asm ppc4xx '"maclhw r10, r11, r12"'
+
+diff --git a/libavcodec/arm/asm.S b/libavcodec/arm/asm.S
+index 69d8348..087b279 100644
+--- a/libavcodec/arm/asm.S
++++ b/libavcodec/arm/asm.S
+@@ -45,3 +45,12 @@
+ ldr \rd, =\val
+ #endif
+ .endm
++
++#if HAVE_VFP_ARGS
++ .eabi_attribute 28, 1
++# define VFP
++# define NOVFP @
++#else
++# define VFP @
++# define NOVFP
++#endif
+--
+1.6.3.3
+
diff --git a/debian/patches/series b/debian/patches/series
index 352acfa..450c1c5 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -10,3 +10,26 @@ issue1245.patch
# fpic-ftbfs-fix.patch
# fpic-libpostproc-fix.patch
# fpic-libswscale-fix.patch
+
+# NEON backport
+neon/0001-ARM-NEON-optimised-add_pixels_clamped.patch
+neon/0002-ARM-NEON-optimized-put_signed_pixels_clamped.patch
+neon/0003-Add-guaranteed-alignment-for-loading-dest-pixels-in-.patch
+neon/0004-Reorganise-intreadwrite.h.patch
+neon/0005-ARM-asm-for-AV_RN.patch
+neon/0006-ARM-NEON-put_pixels_clamped.patch
+neon/0007-ARM-Use-fewer-register-in-NEON-put_pixels-_y2-and-_x.patch
+neon/0008-ARM-NEON-VP3-Loop-Filter.patch
+neon/0009-ARM-actually-add-VP3-loop-filter.patch
+neon/0010-ARM-add-some-PLD-in-NEON-IDCT.patch
+neon/0011-ARM-slightly-faster-NEON-H264-horizontal-loop-filter.patch
+neon/0012-ARM-enable-fast_unaligned-when-cpu-armv-67-is-specif.patch
+neon/0013-ARM-NEON-VP3-IDCT.patch
+neon/0014-ARM-NEON-optimised-vorbis_inverse_coupling.patch
+neon/0015-ARM-handle-VFP-register-arguments-in-ff_vector_fmul_.patch
+neon/0016-ARM-NEON-optimised-FFT-and-MDCT.patch
+neon/0017-ARM-faster-NEON-IMDCT.patch
+neon/0018-Prepare-for-optimised-forward-MDCT-implementations.patch
+neon/0019-ARM-NEON-optimised-MDCT.patch
+neon/0020-ARM-remove-unnecessary-.fpu-neon-directives.patch
+neon/0021-ARM-check-for-VFP-register-arguments.patch
--
FFmpeg packaging
More information about the pkg-multimedia-commits
mailing list