[SCM] FFmpeg packaging branch, upstream, updated. 0e69dce4f29f262d94a4e46014aea5a35fb5e854
Reinhard Tartler
siretart at tauware.de
Sat Jan 10 15:45:09 UTC 2009
The following commit has been merged in the upstream branch:
commit 0e69dce4f29f262d94a4e46014aea5a35fb5e854
Author: Reinhard Tartler <siretart at tauware.de>
Date: Sat Jan 10 16:44:07 2009 +0100
Imported Upstream version 0.svn20090110
diff --git a/Changelog b/Changelog
index 74c9a17..d785492 100644
--- a/Changelog
+++ b/Changelog
@@ -140,6 +140,8 @@ version <next>
- liba52 wrapper removed
- Speex decoding via libspeex
- Electronic Arts TGQ decoder
+- RV30 and RV40 decoder
+- QCELP / PureVoice decoder
version 0.4.9-pre1:
diff --git a/MAINTAINERS b/MAINTAINERS
index 5e421d6..b0e9630 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -161,6 +161,7 @@ Codecs:
nuv.c Reimar Doeffinger
pcx.c Ivo van Poorten
ptx.c Ivo van Poorten
+ qcelp* Reynaldo H. Verdejo Pinochet
qdm2.c, qdm2data.h Roberto Togni
qdrw.c Kostya Shishkov
qpeg.c Kostya Shishkov
@@ -288,6 +289,9 @@ Muxers/Demuxers:
westwood.c Mike Melanson
wv.c Kostya Shishkov
+Protocols:
+ udp.c Luca Abeni
+
Operating systems / CPU architectures
=====================================
@@ -295,7 +299,6 @@ Operating systems / CPU architectures
Alpha Mans Rullgard, Falk Hueffner
ARM Mans Rullgard
BeOS Francois Revol
-i386 Michael Niedermayer
Mac OS X / PowerPC Romain Dolbeau, Guillaume Poirier
Amiga / PowerPC Colin Ward
Linux / PowerPC Luca Barbato
@@ -303,3 +306,14 @@ Windows MinGW Alex Beregszaszi, Ramiro Polla
Windows Cygwin Victor Paesa
ADI/Blackfin DSP Marc Hoffman
Sparc Roman Shaposhnik
+x86 Michael Niedermayer
+
+
+GnuPG Fingerprints of maintainers and others who have svn write access
+======================================================================
+
+Benoit Fouet B22A 4F4F 43EF 636B BB66 FCDC 0023 AE1E 2985 49C8
+Michael Niedermayer 9FF2 128B 147E F673 0BAD F133 611E C787 040B 0FAB
+Reimar Döffinger C61D 16E5 9E2C D10C 8958 38A4 0899 A2B9 06D4 D9C7
+Reynaldo H. Verdejo Pinochet 6E27 CD34 170C C78E 4D4F 5F40 C18E 077F 3114 452A
+Sascha Sommer 38A0 F88B 868E 9D3A 97D4 D6A0 E823 706F 1E07 0D3C
diff --git a/Makefile b/Makefile
index ad3daf0..00b275d 100644
--- a/Makefile
+++ b/Makefile
@@ -299,14 +299,14 @@ $(CODEC_TESTS) $(LAVF_TESTS): regtest-ref
regtest-ref: ffmpeg$(EXESUF) tests/vsynth1/00.pgm tests/vsynth2/00.pgm tests/asynth1.sw
$(CODEC_TESTS) regtest-ref: tests/tiny_psnr$(EXESUF)
- $(SRC_PATH)/tests/regression.sh $@ vsynth tests/vsynth1 a
- $(SRC_PATH)/tests/regression.sh $@ rotozoom tests/vsynth2 a
+ $(SRC_PATH)/tests/regression.sh $@ vsynth tests/vsynth1 a "$(TARGET_EXEC)" "$(TARGET_PATH)"
+ $(SRC_PATH)/tests/regression.sh $@ rotozoom tests/vsynth2 a "$(TARGET_EXEC)" "$(TARGET_PATH)"
$(LAVF_TESTS):
- $(SRC_PATH)/tests/regression.sh $@ lavf tests/vsynth1 b
+ $(SRC_PATH)/tests/regression.sh $@ lavf tests/vsynth1 b "$(TARGET_EXEC)" "$(TARGET_PATH)"
seektest: codectest libavtest tests/seek_test$(EXESUF)
- $(SRC_PATH)/tests/seek_test.sh $(SEEK_REFFILE)
+ $(SRC_PATH)/tests/seek_test.sh $(SEEK_REFFILE) "$(TARGET_EXEC)" "$(TARGET_PATH)"
servertest: ffserver$(EXESUF) tests/vsynth1/00.pgm tests/asynth1.sw
@echo
@@ -326,8 +326,8 @@ tests/vsynth2/00.pgm: tests/rotozoom$(EXESUF)
tests/asynth1.sw: tests/audiogen$(EXESUF)
$(BUILD_ROOT)/$< $@
-%$(EXESUF): %.c
- $(CC) $(FF_LDFLAGS) $(CFLAGS) -o $@ $<
+tests/%$(EXESUF): tests/%.c
+ $(HOSTCC) $(HOSTCFLAGS) $(HOSTLDFLAGS) -o $@ $< $(HOSTLIBS)
tests/seek_test$(EXESUF): tests/seek_test.c $(FF_DEP_LIBS)
$(CC) $(FF_LDFLAGS) $(CFLAGS) -o $@ $< $(FF_EXTRALIBS)
diff --git a/README b/README
index 6274ea2..404c33b 100644
--- a/README
+++ b/README
@@ -17,7 +17,7 @@ FFmpeg README
License, see the file COPYING.GPL for details. Their compilation and use
in FFmpeg is optional.
-* The file libavcodec/i386/idct_mmx.c is distributed under the GNU General
+* The file libavcodec/x86/idct_mmx.c is distributed under the GNU General
Public License. It is strictly an optimization and its use is optional.
* The file libavcodec/ac3dec.c is distributed under the GNU General Public
diff --git a/cmdutils.c b/cmdutils.c
index f52f56e..32ffefa 100644
--- a/cmdutils.c
+++ b/cmdutils.c
@@ -127,10 +127,10 @@ void parse_options(int argc, char **argv, const OptionDef *options,
opt = argv[optindex++];
if (handleoptions && opt[0] == '-' && opt[1] != '\0') {
- if (opt[1] == '-' && opt[2] == '\0') {
- handleoptions = 0;
- continue;
- }
+ if (opt[1] == '-' && opt[2] == '\0') {
+ handleoptions = 0;
+ continue;
+ }
po= find_option(options, opt + 1);
if (!po->name)
po= find_option(options, "default");
@@ -176,25 +176,30 @@ unknown_opt:
int opt_default(const char *opt, const char *arg){
int type;
+ int ret= 0;
const AVOption *o= NULL;
int opt_types[]={AV_OPT_FLAG_VIDEO_PARAM, AV_OPT_FLAG_AUDIO_PARAM, 0, AV_OPT_FLAG_SUBTITLE_PARAM, 0};
- for(type=0; type<CODEC_TYPE_NB; type++){
+ for(type=0; type<CODEC_TYPE_NB && ret>= 0; type++){
const AVOption *o2 = av_find_opt(avctx_opts[0], opt, NULL, opt_types[type], opt_types[type]);
if(o2)
- o = av_set_string2(avctx_opts[type], opt, arg, 1);
+ ret = av_set_string3(avctx_opts[type], opt, arg, 1, &o);
}
if(!o)
- o = av_set_string2(avformat_opts, opt, arg, 1);
+ ret = av_set_string3(avformat_opts, opt, arg, 1, &o);
if(!o)
- o = av_set_string2(sws_opts, opt, arg, 1);
+ ret = av_set_string3(sws_opts, opt, arg, 1, &o);
if(!o){
if(opt[0] == 'a')
- o = av_set_string2(avctx_opts[CODEC_TYPE_AUDIO], opt+1, arg, 1);
+ ret = av_set_string3(avctx_opts[CODEC_TYPE_AUDIO], opt+1, arg, 1, &o);
else if(opt[0] == 'v')
- o = av_set_string2(avctx_opts[CODEC_TYPE_VIDEO], opt+1, arg, 1);
+ ret = av_set_string3(avctx_opts[CODEC_TYPE_VIDEO], opt+1, arg, 1, &o);
else if(opt[0] == 's')
- o = av_set_string2(avctx_opts[CODEC_TYPE_SUBTITLE], opt+1, arg, 1);
+ ret = av_set_string3(avctx_opts[CODEC_TYPE_SUBTITLE], opt+1, arg, 1, &o);
+ }
+ if (o && ret < 0) {
+ fprintf(stderr, "Invalid value '%s' for option '%s'\n", arg, opt);
+ exit(1);
}
if(!o)
return -1;
@@ -219,7 +224,7 @@ void set_context_opts(void *ctx, void *opts_ctx, int flags)
const char *str= av_get_string(opts_ctx, opt_names[i], &opt, buf, sizeof(buf));
/* if an option with name opt_names[i] is present in opts_ctx then str is non-NULL */
if(str && ((opt->flags & flags) == flags))
- av_set_string2(ctx, opt_names[i], str, 1);
+ av_set_string3(ctx, opt_names[i], str, 1, NULL);
}
}
@@ -287,7 +292,7 @@ static void print_all_lib_versions(FILE* outstream, int indent)
void show_banner(void)
{
- fprintf(stderr, "%s version " FFMPEG_VERSION ", Copyright (c) %d-2008 Fabrice Bellard, et al.\n",
+ fprintf(stderr, "%s version " FFMPEG_VERSION ", Copyright (c) %d-2009 Fabrice Bellard, et al.\n",
program_name, program_birth_year);
fprintf(stderr, " configuration: " FFMPEG_CONFIGURATION "\n");
print_all_lib_versions(stderr, 1);
diff --git a/common.mak b/common.mak
index d4c1f14..8ac6ee0 100644
--- a/common.mak
+++ b/common.mak
@@ -91,10 +91,10 @@ $(SUBDIR)%-test.o: $(SUBDIR)%.c
$(SUBDIR)%-test.o: $(SUBDIR)%-test.c
$(CC) $(CFLAGS) -DTEST -c -o $$@ $$^
-$(SUBDIR)i386/%.o: $(SUBDIR)i386/%.asm
+$(SUBDIR)x86/%.o: $(SUBDIR)x86/%.asm
$(YASM) $(YASMFLAGS) -I $$(<D)/ -o $$@ $$<
-$(SUBDIR)i386/%.d: $(SUBDIR)i386/%.asm
+$(SUBDIR)x86/%.d: $(SUBDIR)x86/%.asm
$(YASM) $(YASMFLAGS) -I $$(<D)/ -M -o $$(@:%.d=%.o) $$< > $$@
clean::
diff --git a/configure b/configure
index d633444..56606aa 100755
--- a/configure
+++ b/configure
@@ -68,111 +68,131 @@ show_help(){
echo " --shlibdir=DIR install shared libs in DIR [PREFIX/lib]"
echo " --incdir=DIR install includes in DIR [PREFIX/include]"
echo " --mandir=DIR install man page in DIR [PREFIX/share/man]"
- echo " --enable-static build static libraries [default=yes]"
- echo " --disable-static do not build static libraries [default=no]"
- echo " --enable-shared build shared libraries [default=no]"
- echo " --disable-shared do not build shared libraries [default=yes]"
- echo " --enable-gpl allow use of GPL code, the resulting libav*"
- echo " and ffmpeg will be under GPL [default=no]"
- echo " --enable-nonfree allow use of nonfree code, the resulting libav*"
- echo " and ffmpeg will be unredistributable [default=no]"
- echo " --enable-postproc enable GPLed postprocessing support [default=no]"
- echo " --enable-swscale software scaler support [default=no]"
- echo " --enable-avfilter video filter support (replaces vhook) [default=no]"
- echo " --enable-avfilter-lavf video filters dependant on avformat [default=no]"
- echo " --enable-beosthreads use BeOS threads [default=no]"
- echo " --enable-os2threads use OS/2 threads [default=no]"
- echo " --enable-pthreads use pthreads [default=no]"
- echo " --enable-w32threads use Win32 threads [default=no]"
- echo " --enable-x11grab enable X11 grabbing [default=no]"
+ echo " --enable-static build static libraries [yes]"
+ echo " --disable-static do not build static libraries [no]"
+ echo " --enable-shared build shared libraries [no]"
+ echo " --disable-shared do not build shared libraries [yes]"
+ echo " --enable-gpl allow use of GPL code, the resulting libs"
+ echo " and binaries will be under GPL [no]"
+ echo " --enable-nonfree allow use of nonfree code, the resulting libs"
+ echo " and binaries will be unredistributable [no]"
+ echo " --enable-postproc enable GPLed postprocessing support [no]"
+ echo " --enable-swscale enable GPLed software scaler support [no]"
+ echo " --enable-avfilter video filter support (replaces vhook) [no]"
+ echo " --enable-avfilter-lavf video filters dependent on avformat [no]"
+ echo " --disable-vhook disable video hooking support"
+ echo " --enable-beosthreads use BeOS threads [no]"
+ echo " --enable-os2threads use OS/2 threads [no]"
+ echo " --enable-pthreads use pthreads [no]"
+ echo " --enable-vdpau enable VDPAU support [no]"
+ echo " --enable-w32threads use Win32 threads [no]"
+ echo " --enable-x11grab enable X11 grabbing [no]"
+ echo " --enable-xvmc enable XvMC support [no]"
echo
echo "External library support:"
- echo " --enable-mlib use Sun medialib [default=no]"
- echo " --enable-avisynth allow reading AVISynth script files [default=no]"
- echo " --enable-libamr-nb enable libamr-nb floating point audio codec"
- echo " --enable-libamr-wb enable libamr-wb floating point audio codec"
+ echo " --enable-mlib enable Sun medialib [no]"
+ echo " --enable-avisynth enable reading of AVISynth script files [no]"
+ echo " --enable-bzlib enable bzlib [autodetect]"
+ echo " --enable-libamr-nb enable libamr-nb floating point audio codec [no]"
+ echo " --enable-libamr-wb enable libamr-wb floating point audio codec [no]"
echo " --enable-libdc1394 enable IIDC-1394 grabbing using libdc1394"
- echo " and libraw1394 [default=no]"
- echo " --enable-libdirac enable Dirac support via libdirac [default=no]"
- echo " --enable-libfaac enable FAAC support via libfaac [default=no]"
- echo " --enable-libfaad enable FAAD support via libfaad [default=no]"
- echo " --enable-libfaadbin open libfaad.so.0 at runtime [default=no]"
- echo " --enable-libgsm enable GSM support via libgsm [default=no]"
- echo " --enable-libmp3lame enable MP3 encoding via libmp3lame [default=no]"
+ echo " and libraw1394 [no]"
+ echo " --enable-libdirac enable Dirac support via libdirac [no]"
+ echo " --enable-libfaac enable FAAC support via libfaac [no]"
+ echo " --enable-libfaad enable FAAD support via libfaad [no]"
+ echo " --enable-libfaadbin open libfaad.so.0 at runtime [no]"
+ echo " --enable-libgsm enable GSM support via libgsm [no]"
+ echo " --enable-libmp3lame enable MP3 encoding via libmp3lame [no]"
echo " --enable-libnut enable NUT (de)muxing via libnut,"
- echo " native demuxer exists [default=no]"
- echo " --enable-libschroedinger enable Dirac support via libschroedinger [default=no]"
- echo " --enable-libspeex enable Speex decoding via libspeex [default=no]"
- echo " --enable-libtheora enable Theora encoding via libtheora [default=no]"
+ echo " native (de)muxer exists [no]"
+ echo " --enable-libschroedinger enable Dirac support via libschroedinger [no]"
+ echo " --enable-libspeex enable Speex decoding via libspeex [no]"
+ echo " --enable-libtheora enable Theora encoding via libtheora [no]"
echo " --enable-libvorbis enable Vorbis encoding via libvorbis,"
- echo " native implementation exists [default=no]"
- echo " --enable-libx264 enable H.264 encoding via x264 [default=no]"
+ echo " native implementation exists [no]"
+ echo " --enable-libx264 enable H.264 encoding via x264 [no]"
echo " --enable-libxvid enable Xvid encoding via xvidcore,"
- echo " native MPEG-4/Xvid encoder exists [default=no]"
+ echo " native MPEG-4/Xvid encoder exists [no]"
+ echo " --enable-zlib enable zlib [autodetect]"
echo ""
echo "Advanced options (experts only):"
echo " --source-path=PATH path to source code [$source_path]"
echo " --cross-prefix=PREFIX use PREFIX for compilation tools [$cross_prefix]"
echo " --enable-cross-compile assume a cross-compiler is used"
echo " --target-os=OS compiler targets OS [$target_os]"
+ echo " --target-exec=CMD command to run executables on target"
+ echo " --target-path=DIR path to view of build directory on target"
+ echo " --nm=NM use nm tool"
echo " --cc=CC use C compiler CC [$cc]"
+ echo " --host-cc=HOSTCC use host C compiler HOSTCC"
+ echo " --host-cflags=HCFLAGS use HCFLAGS when compiling for host"
+ echo " --host-ldflags=HLDFLAGS use HLDFLAGS when linking for host"
+ echo " --host-libs=HLIBS use libs HLIBS when linking for host"
echo " --extra-cflags=ECFLAGS add ECFLAGS to CFLAGS [$CFLAGS]"
echo " --extra-ldflags=ELDFLAGS add ELDFLAGS to LDFLAGS [$LDFLAGS]"
echo " --extra-libs=ELIBS add ELIBS [$ELIBS]"
echo " --extra-version=STRING version string suffix []"
- echo " --build-suffix=SUFFIX suffix for application specific build []"
- echo " --arch=ARCH select architecture [$arch]"
- echo " --cpu=CPU selects the minimum cpu required (affects"
+ echo " --build-suffix=SUFFIX library name suffix []"
+ echo " --arch=ARCH select architecture [$arch]"
+ echo " --cpu=CPU select the minimum required CPU (affects"
echo " instruction selection, may crash on older CPUs)"
echo " --enable-powerpc-perf enable performance report on PPC"
echo " (requires enabling PMC)"
- echo " --disable-mmx disable MMX usage"
- echo " --disable-mmx2 disable MMX2 usage"
- echo " --disable-ssse3 disable SSSE3 usage"
- echo " --disable-armv5te disable armv5te usage"
- echo " --disable-armv6 disable armv6 usage"
- echo " --disable-armvfp disable ARM VFP usage"
- echo " --disable-iwmmxt disable iwmmxt usage"
- echo " --disable-altivec disable AltiVec usage"
- echo " --disable-network disable network support [default=no]"
- echo " --disable-ipv6 disable ipv6 support [default=no]"
- echo " --disable-zlib disable zlib [default=no]"
- echo " --disable-bzlib disable bzlib [default=no]"
- echo " --disable-vhook disable video hooking support"
+ echo " --disable-altivec disable AltiVec optimizations"
+ echo " --disable-mmx disable MMX optimizations"
+ echo " --disable-mmx2 disable MMX2 optimizations"
+ echo " --disable-sse disable SSE optimizations"
+ echo " --disable-ssse3 disable SSSE3 optimizations"
+ echo " --disable-armv5te disable armv5te optimizations"
+ echo " --disable-armv6 disable armv6 optimizations"
+ echo " --disable-armvfp disable ARM VFP optimizations"
+ echo " --disable-iwmmxt disable iwmmxt optimizations"
+ echo " --disable-mmi disable MMI optimizations"
+ echo " --disable-neon disable neon optimizations"
+ echo " --disable-vis disable VIS optimizations"
+ echo " --disable-network disable network support [no]"
+ echo " --disable-ipv6 disable IPv6 support [no]"
echo " --disable-mpegaudio-hp faster (but less accurate)"
- echo " MPEG audio decoding [default=no]"
+ echo " MPEG audio decoding [no]"
+ echo " --disable-aandct disable AAN DCT code"
+ echo " --disable-fft disable FFT code"
+ echo " --disable-golomb disable Golomb code"
+ echo " --disable-mdct disable MDCT code"
echo " --enable-gray enable full grayscale support (slower color)"
echo " --disable-ffmpeg disable ffmpeg build"
- echo " --disable-ffserver disable ffserver build"
echo " --disable-ffplay disable ffplay build"
+ echo " --disable-ffserver disable ffserver build"
echo " --enable-small optimize for size instead of speed"
echo " --enable-hardcoded-tables use hardcoded tables instead of runtime generation"
echo " --enable-memalign-hack emulate memalign, interferes with memory debuggers"
- echo " --disable-encoder=NAME disables encoder NAME"
- echo " --enable-encoder=NAME enables encoder NAME"
- echo " --disable-decoder=NAME disables decoder NAME"
- echo " --enable-decoder=NAME enables decoder NAME"
- echo " --disable-encoders disables all encoders"
- echo " --disable-decoders disables all decoders"
- echo " --disable-muxer=NAME disables muxer NAME"
- echo " --enable-muxer=NAME enables muxer NAME"
- echo " --disable-muxers disables all muxers"
- echo " --disable-demuxer=NAME disables demuxer NAME"
- echo " --enable-demuxer=NAME enables demuxer NAME"
- echo " --disable-demuxers disables all demuxers"
- echo " --enable-parser=NAME enables parser NAME"
- echo " --disable-parser=NAME disables parser NAME"
- echo " --disable-parsers disables all parsers"
- echo " --enable-bsf=NAME enables bitstream filter NAME"
- echo " --disable-bsf=NAME disables bitstream filter NAME"
- echo " --disable-bsfs disables all bitstream filters"
- echo " --enable-protocol=NAME enables protocol NAME"
- echo " --disable-protocol=NAME disables protocol NAME"
- echo " --disable-protocols disables all protocols"
- echo " --disable-devices disables all devices"
- echo " --enable-filter=NAME enables filter NAME"
- echo " --disable-filter=NAME disables filter NAME"
- echo " --disable-filters disables all filters"
+ echo " --enable-beos-netserver enable BeOS netserver"
+ echo " --disable-encoder=NAME disable encoder NAME"
+ echo " --enable-encoder=NAME enable encoder NAME"
+ echo " --disable-encoders disable all encoders"
+ echo " --disable-decoder=NAME disable decoder NAME"
+ echo " --enable-decoder=NAME enable decoder NAME"
+ echo " --disable-decoders disable all decoders"
+ echo " --disable-muxer=NAME disable muxer NAME"
+ echo " --enable-muxer=NAME enable muxer NAME"
+ echo " --disable-muxers disable all muxers"
+ echo " --disable-demuxer=NAME disable demuxer NAME"
+ echo " --enable-demuxer=NAME enable demuxer NAME"
+ echo " --disable-demuxers disable all demuxers"
+ echo " --enable-parser=NAME enable parser NAME"
+ echo " --disable-parser=NAME disable parser NAME"
+ echo " --disable-parsers disable all parsers"
+ echo " --enable-bsf=NAME enable bitstream filter NAME"
+ echo " --disable-bsf=NAME disable bitstream filter NAME"
+ echo " --disable-bsfs disable all bitstream filters"
+ echo " --enable-protocol=NAME enable protocol NAME"
+ echo " --disable-protocol=NAME disable protocol NAME"
+ echo " --disable-protocols disable all protocols"
+ echo " --disable-indevs disable input devices"
+ echo " --disable-outdevs disable output devices"
+ echo " --disable-devices disable all devices"
+ echo " --enable-filter=NAME enable filter NAME"
+ echo " --disable-filter=NAME disable filter NAME"
+ echo " --disable-filters disable all filters"
echo " --list-decoders show all available decoders"
echo " --list-encoders show all available encoders"
echo " --list-muxers show all available muxers"
@@ -353,7 +373,7 @@ disabled_any(){
set_default(){
for opt; do
- eval test -z "\$$opt" && eval $opt=\$${opt}_default
+ eval : \${$opt:=\$${opt}_default}
done
}
@@ -494,7 +514,7 @@ check_asm(){
asm="$2"
shift 2
check_cc "$@" <<EOF && enable $name || disable $name
-int foo(void){ __asm__ volatile($asm); }
+void foo(void){ __asm__ volatile($asm); }
EOF
}
@@ -714,6 +734,7 @@ COMPONENT_LIST="
CONFIG_LIST="
$COMPONENT_LIST
+ aandct
avfilter
avfilter_lavf
avisynth
@@ -753,10 +774,14 @@ CONFIG_LIST="
nonfree
postproc
powerpc_perf
+ shared
small
+ static
swscale
+ vdpau
vhook
x11grab
+ xvmc
zlib
"
@@ -769,13 +794,13 @@ THREADS_LIST='
ARCH_LIST='
alpha
- armv4l
+ arm
bfin
ia64
m68k
mips
parisc
- powerpc
+ ppc
s390
sh4
sparc
@@ -789,12 +814,14 @@ ARCH_EXT_LIST='
altivec
armv5te
armv6
+ armv6t2
armvfp
iwmmxt
mmi
mmx
mmx2
neon
+ sse
ssse3
vis
'
@@ -841,6 +868,7 @@ HAVE_LIST="
memalign
mkstemp
pld
+ posix_memalign
ppc64
round
roundf
@@ -863,6 +891,7 @@ HAVE_LIST="
# options emitted with CONFIG_ prefix but not available on command line
CONFIG_EXTRA="
fft_mmx
+ oldscaler
"
CMDLINE_SELECT="
@@ -874,8 +903,6 @@ CMDLINE_SELECT="
extra_warnings
logging
optimizations
- shared
- static
stripping
"
@@ -896,29 +923,39 @@ CMDLINE_SET="
cpu
cross_prefix
extra_version
+ host_cc
+ host_cflags
+ host_ldflags
+ host_libs
logfile
+ nm
source_path
+ target_exec
target_os
+ target_path
"
# code dependency declarations
# architecture extensions
-altivec_deps="powerpc"
-armv5te_deps="armv4l"
-armv6_deps="armv4l"
-armvfp_deps="armv4l"
-iwmmxt_deps="armv4l"
+altivec_deps="ppc"
+armv5te_deps="arm"
+armv6_deps="arm"
+armv6t2_deps="arm"
+armvfp_deps="arm"
+iwmmxt_deps="arm"
mmi_deps="mips"
mmx_deps="x86"
mmx2_deps="x86 mmx"
-neon_deps="armv4l"
-ssse3_deps="x86"
+neon_deps="arm"
+sse_deps="mmx"
+ssse3_deps="x86 sse"
vis_deps="sparc"
# common features
fft_suggest="fft_mmx"
fft_mmx_deps="mmx yasm"
+oldscaler_deps="!swscale"
# decoders / encoders
aac_decoder_select="fft mdct"
@@ -929,25 +966,42 @@ cavs_decoder_select="golomb"
cook_decoder_select="fft mdct"
cscd_decoder_suggest="zlib"
dca_decoder_select="fft mdct"
+dnxhd_encoder_select="aandct"
dxa_decoder_select="zlib"
eac3_decoder_deps="gpl"
eac3_decoder_select="fft mdct"
+eatgq_decoder_select="aandct"
ffv1_decoder_select="golomb"
flac_decoder_select="golomb"
flac_encoder_select="golomb"
flashsv_decoder_select="zlib"
flashsv_encoder_select="zlib"
+flv_encoder_select="aandct"
+h261_encoder_select="aandct"
+h263_encoder_select="aandct"
+h263p_encoder_select="aandct"
h264_decoder_select="golomb"
+h264_vdpau_decoder_deps="vdpau"
imc_decoder_select="fft mdct"
jpegls_decoder_select="golomb"
jpegls_encoder_select="golomb"
+ljpeg_encoder_select="aandct"
loco_decoder_select="golomb"
-mpeg_xvmc_decoder_deps="xvmc"
+mjpeg_encoder_select="aandct"
+mpeg1video_encoder_select="aandct"
+mpeg2video_encoder_select="aandct"
+mpeg4_encoder_select="aandct"
+mpeg_xvmc_decoder_deps="xvmc X11_extensions_XvMClib_h"
+msmpeg4v1_encoder_select="aandct"
+msmpeg4v2_encoder_select="aandct"
+msmpeg4v3_encoder_select="aandct"
nellymoser_decoder_select="fft mdct"
nellymoser_encoder_select="fft mdct"
png_decoder_select="zlib"
png_encoder_select="zlib"
qdm2_decoder_select="fft mdct"
+rv10_encoder_select="aandct"
+rv20_encoder_select="aandct"
shorten_decoder_select="golomb"
sonic_decoder_select="golomb"
sonic_encoder_select="golomb"
@@ -963,6 +1017,8 @@ wmav1_decoder_select="fft mdct"
wmav1_encoder_select="fft mdct"
wmav2_decoder_select="fft mdct"
wmav2_encoder_select="fft mdct"
+wmv1_encoder_select="aandct"
+wmv2_encoder_select="aandct"
zlib_decoder_select="zlib"
zlib_encoder_select="zlib"
zmbv_decoder_select="zlib"
@@ -991,6 +1047,7 @@ libvorbis_encoder_deps="libvorbis"
libx264_encoder_deps="libx264"
libxvid_encoder_deps="libxvid"
mpeg4aac_decoder_deps="libfaad"
+vdpau_deps="vdpau_vdpau_h vdpau_vdpau_x11_h"
# demuxers / muxers
ac3_demuxer_deps="ac3_parser"
@@ -1050,15 +1107,15 @@ mandir_default='${prefix}/share/man'
shlibdir_default="$libdir_default"
# toolchain
-cc_default="gcc"
-yasmexe="yasm"
ar="ar"
-nm="nm"
-ranlib="ranlib"
-strip="strip"
-asmalign_pot="unknown"
+cc_default="gcc"
+host_cc_default="gcc"
ln_s="ln -sf"
+nm_default="nm"
objformat="elf"
+ranlib="ranlib"
+strip="strip"
+yasmexe="yasm"
# machine
arch=`uname -m`
@@ -1075,6 +1132,7 @@ enable ffserver
enable ipv6
enable mpegaudio_hp
enable network
+enable oldscaler
enable optimizations
enable protocols
enable static
@@ -1097,6 +1155,11 @@ SLIBNAME_WITH_VERSION='$(SLIBNAME).$(LIBVERSION)'
SLIBNAME_WITH_MAJOR='$(SLIBNAME).$(LIBMAJOR)'
LIB_INSTALL_EXTRA_CMD='$(RANLIB) "$(LIBDIR)/$(LIBNAME)"'
+host_cflags='-O3 -g -Wall'
+host_libs='-lm'
+
+target_path='.'
+
# gcc stupidly only outputs the basename of targets with -MM, but we need the
# full relative path for objects in subdirectories for non-recursive Make.
DEPEND_CMD='$(CC) $(CFLAGS) -MM -MG $< | sed -e "/^\#.*/d" -e "s,^[[:space:]]*$(*F)\\.o,$(@D)/$(*F).o," -e "s,\\([[:space:]]\\)\\(version\\.h\\),\\1\$$(BUILD_ROOT_REL)/\\2,"'
@@ -1211,30 +1274,29 @@ disabled logging && logfile=/dev/null
echo "# $0 $@" > $logfile
set >> $logfile
-cc_default="${cross_prefix}${cc_default}"
-yasmexe="${cross_prefix}${yasmexe}"
+test -n "$cross_prefix" && enable cross_compile
+
ar="${cross_prefix}${ar}"
-nm="${cross_prefix}${nm}"
+cc_default="${cross_prefix}${cc_default}"
+nm_default="${cross_prefix}${nm_default}"
ranlib="${cross_prefix}${ranlib}"
strip="${cross_prefix}${strip}"
-set_default cc
+set_default cc nm
+enabled cross_compile || host_cc_default=$cc
+set_default host_cc
# set temporary file name
-if test ! -z "$TMPDIR" ; then
- TMPDIR1="${TMPDIR}"
-elif test ! -z "$TEMPDIR" ; then
- TMPDIR1="${TEMPDIR}"
-else
- TMPDIR1="/tmp"
-fi
+: ${TMPDIR:=$TEMPDIR}
+: ${TMPDIR:=$TMP}
+: ${TMPDIR:=/tmp}
-TMPC="${TMPDIR1}/ffmpeg-conf-${RANDOM}-$$-${RANDOM}.c"
-TMPE="${TMPDIR1}/ffmpeg-conf-${RANDOM}-$$-${RANDOM}${EXESUF}"
-TMPH="${TMPDIR1}/ffmpeg-conf-${RANDOM}-$$-${RANDOM}.h"
-TMPO="${TMPDIR1}/ffmpeg-conf-${RANDOM}-$$-${RANDOM}.o"
-TMPS="${TMPDIR1}/ffmpeg-conf-${RANDOM}-$$-${RANDOM}.S"
-TMPSH="${TMPDIR1}/ffmpeg-conf-${RANDOM}-$$-${RANDOM}.sh"
+TMPC="${TMPDIR}/ffmpeg-conf-${RANDOM}-$$-${RANDOM}.c"
+TMPE="${TMPDIR}/ffmpeg-conf-${RANDOM}-$$-${RANDOM}${EXESUF}"
+TMPH="${TMPDIR}/ffmpeg-conf-${RANDOM}-$$-${RANDOM}.h"
+TMPO="${TMPDIR}/ffmpeg-conf-${RANDOM}-$$-${RANDOM}.o"
+TMPS="${TMPDIR}/ffmpeg-conf-${RANDOM}-$$-${RANDOM}.S"
+TMPSH="${TMPDIR}/ffmpeg-conf-${RANDOM}-$$-${RANDOM}.sh"
check_cflags -std=c99
@@ -1252,26 +1314,29 @@ case "$arch" in
int test[sizeof(char*) - 7];
EOF
;;
- # armv4l is a subset of armv[567]*l
arm|armv[4567]*l)
- arch="armv4l"
+ arch="arm"
;;
alpha)
arch="alpha"
enable fast_64bit
;;
"Power Macintosh"|ppc|powerpc)
- arch="powerpc"
+ arch="ppc"
enable fast_unaligned
;;
ppc64)
- arch="powerpc"
+ arch="ppc"
enable fast_64bit
enable fast_unaligned
;;
mips|mipsel|IP*)
arch="mips"
;;
+ mips64)
+ arch="mips"
+ enable fast_64bit
+ ;;
sun4u|sparc64)
arch="sparc64"
enable fast_64bit
@@ -1279,13 +1344,13 @@ EOF
sparc)
arch="sparc"
;;
- sh4)
+ sh4|sh)
arch="sh4"
;;
- parisc)
+ parisc|hppa)
arch="parisc"
;;
- parisc64)
+ parisc64|hppa64)
arch="parisc"
enable fast_64bit
;;
@@ -1343,6 +1408,7 @@ case $target_os in
FFSERVERLDFLAGS=""
SHFLAGS='-shared -Wl,-h,$$(@F)'
network_extralibs="-lsocket -lnsl"
+ add_cflags -D__EXTENSIONS__
;;
netbsd)
oss_demuxer_extralibs="-lossaudio"
@@ -1352,9 +1418,6 @@ case $target_os in
disable need_memalign
LIBOBJFLAGS='$(PIC)'
SHFLAGS='-shared'
- SLIBNAME='$(SLIBPREF)$(FULLNAME)$(SLIBSUF).$(LIBVERSION)'
- SLIBNAME_WITH_VERSION='$(SLIBNAME)'
- SLIBNAME_WITH_MAJOR='$(SLIBNAME)'
oss_demuxer_extralibs="-lossaudio"
oss_muxer_extralibs="-lossaudio"
;;
@@ -1408,8 +1471,8 @@ case $target_os in
check_cpp_condition _mingw.h "(__MINGW32_MAJOR_VERSION > 3) || (__MINGW32_MAJOR_VERSION == 3 && __MINGW32_MINOR_VERSION >= 15)" ||
die "ERROR: MinGW runtime version must be >= 3.15."
enabled_any avisynth vfwcap_demuxer &&
- { check_cpp_condition w32api.h "(__W32API_MAJOR_VERSION > 3) || (__W32API_MAJOR_VERSION == 3 && __W32API_MINOR_VERSION >= 12)" ||
- die "ERROR: avisynth and vfwcap_demuxer require w32api version 3.12 or later."; }
+ { check_cpp_condition w32api.h "(__W32API_MAJOR_VERSION > 3) || (__W32API_MAJOR_VERSION == 3 && __W32API_MINOR_VERSION >= 13)" ||
+ die "ERROR: avisynth and vfwcap_demuxer require w32api version 3.13 or later."; }
;;
cygwin*)
target_os=cygwin
@@ -1485,8 +1548,6 @@ add_extralibs $osextralibs
# Combine FFLDFLAGS and the LDFLAGS environment variable.
LDFLAGS="$FFLDFLAGS $LDFLAGS"
-test -n "$cross_prefix" && enable cross_compile
-
# we need to build at least one lib type
if ! enabled_any static shared; then
cat <<EOF
@@ -1635,9 +1696,8 @@ EOF
chmod +x $TMPSH >> $logfile 2>&1
if ! $TMPSH >> $logfile 2>&1; then
cat <<EOF
-Unable to create and execute files in $TMPDIR1. Set the TMPDIR environment
-variable to another directory and make sure that $TMPDIR1 is not mounted
-noexec.
+Unable to create and execute files in $TMPDIR. Set the TMPDIR environment
+variable to another directory and make sure that it is not mounted noexec.
EOF
die "Sanity test failed."
fi
@@ -1699,7 +1759,7 @@ fi
# check for assembler specific support
-if test $arch = "powerpc"; then
+if test $arch = "ppc"; then
check_asm dcbzl '"dcbzl 0, 1"'
fi
@@ -1729,9 +1789,10 @@ EOF
fi
# We have to check if pld is a nop and disable it.
-enabled armv4l && check_asm pld '"pld [r0]"'
+enabled arm && check_asm pld '"pld [r0]"'
enabled armv5te && check_asm armv5te '"qadd r0, r0, r0"'
enabled armv6 && check_asm armv6 '"sadd16 r0, r0, r0"'
+enabled armv6t2 && check_asm armv6t2 '"movt r0, #0"'
enabled armvfp && check_asm armvfp '"fadds s0, s0, s0"'
enabled iwmmxt && check_asm iwmmxt '"wunpckelub wr6, wr4"'
enabled mmi && check_asm mmi '"lq $2, 0($2)"'
@@ -1762,6 +1823,7 @@ check_func getrusage
check_func inet_aton $network_extralibs
check_func memalign
check_func mkstemp
+check_func posix_memalign
check_func_headers windows.h GetProcessTimes
check_header conio.h
@@ -1772,13 +1834,16 @@ check_header sys/mman.h
check_header sys/resource.h
check_header sys/select.h
check_header termios.h
+check_header vdpau/vdpau.h
+check_header vdpau/vdpau_x11.h
+check_header X11/extensions/XvMClib.h
-if ! enabled_any memalign memalign_hack && enabled need_memalign ; then
- die "Error, no memalign() but SSE enabled, disable it or use --enable-memalign-hack."
+if ! enabled_any memalign memalign_hack posix_memalign && enabled need_memalign ; then
+ die "Error, no aligned memory allocator but SSE enabled, disable it or use --enable-memalign-hack."
fi
-disabled zlib || check_lib zlib.h zlibVersion -lz || disable zlib
-disabled bzlib || check_lib bzlib.h BZ2_bzlibVersion -lbz2 || disable bzlib
+disabled zlib || check_lib zlib.h zlibVersion -lz || disable zlib
+disabled bzlib || check_lib2 bzlib.h BZ2_bzlibVersion -lbz2 || disable bzlib
# check for some common methods of building with pthread support
# do this before the optional library checks as some of them require pthreads
@@ -1826,17 +1891,17 @@ enabled libdirac && add_cflags $(pkg-config --cflags dirac) &&
enabled libfaac && require2 libfaac "stdint.h faac.h" faacEncGetVersion -lfaac
enabled libfaad && require2 libfaad faad.h faacDecOpen -lfaad
enabled libgsm && require libgsm gsm.h gsm_create -lgsm
-enabled libmp3lame && require LAME lame/lame.h lame_init -lmp3lame -lm
+enabled libmp3lame && require libmp3lame lame/lame.h lame_init -lmp3lame -lm
enabled libnut && require libnut libnut.h nut_demuxer_init -lnut
enabled libschroedinger && add_cflags $(pkg-config --cflags schroedinger-1.0) &&
require libschroedinger schroedinger/schro.h schro_init $(pkg-config --libs schroedinger-1.0)
enabled libspeex && require libspeex speex/speex.h speex_decoder_init -lspeex
enabled libtheora && require libtheora theora/theora.h theora_info_init -ltheora -logg
enabled libvorbis && require libvorbis vorbis/vorbisenc.h vorbis_info_init -lvorbisenc -lvorbis -logg
-enabled libx264 && require x264 x264.h x264_encoder_open -lx264 -lm &&
+enabled libx264 && require libx264 x264.h x264_encoder_open -lx264 -lm &&
{ check_cpp_condition x264.h "X264_BUILD >= 65" ||
die "ERROR: libx264 version must be >= 0.65."; }
-enabled libxvid && require Xvid xvid.h xvid_global -lxvidcore
+enabled libxvid && require libxvid xvid.h xvid_global -lxvidcore
enabled mlib && require mediaLib mlib_types.h mlib_VectorSub_S16_U8_Mod -lmlib
# libdc1394 check
@@ -1909,7 +1974,7 @@ texi2html -version > /dev/null 2>&1 && enable texi2html || disable texi2html
# Network check
if enabled network; then
- check_type sys/socket.h socklen_t
+ check_type "sys/types.h sys/socket.h" socklen_t
# Prefer arpa/inet.h over winsock2
if check_header arpa/inet.h ; then
check_func closesocket
@@ -2031,10 +2096,7 @@ fi
VHOOKCFLAGS="-fPIC"
# Find out if the .align argument is a power of two or not.
-if test $asmalign_pot = "unknown"; then
- disable asmalign_pot
- echo '__asm__ (".align 3");' | check_cc && enable asmalign_pot
-fi
+check_asm asmalign_pot '".align 3"'
enabled_any $DECODER_LIST && enable decoders
enabled_any $ENCODER_LIST && enable encoders
@@ -2062,11 +2124,6 @@ check_deps $CONFIG_LIST \
$OUTDEV_LIST \
$PROTOCOL_LIST \
-enabled libdc1394 && append pkg_requires "libraw1394"
-enabled libdirac && append pkg_requires "dirac"
-enabled libtheora && append pkg_requires "theora"
-enabled libvorbis && append pkg_requires "vorbisenc"
-
echo "install prefix $prefix"
echo "source path $source_path"
echo "C compiler $cc"
@@ -2082,14 +2139,16 @@ echo "big-endian ${bigendian-no}"
if test $arch = "x86_32" -o $arch = "x86_64"; then
echo "yasm ${yasm-no}"
echo "MMX enabled ${mmx-no}"
+ echo "SSE enabled ${sse-no}"
echo "CMOV enabled ${cmov-no}"
echo "CMOV is fast ${fast_cmov-no}"
echo "EBX available ${ebx_available-no}"
echo "EBP available ${ebp_available-no}"
fi
-if test $arch = "armv4l"; then
+if test $arch = "arm"; then
echo "ARMv5TE enabled ${armv5te-no}"
echo "ARMv6 enabled ${armv6-no}"
+ echo "ARMv6T2 enabled ${armv6t2-no}"
echo "ARM VFP enabled ${armvfp-no}"
echo "IWMMXT enabled ${iwmmxt-no}"
echo "NEON enabled ${neon-no}"
@@ -2097,7 +2156,7 @@ fi
if test $arch = "mips"; then
echo "MMI enabled ${mmi-no}"
fi
-if test $arch = "powerpc"; then
+if test $arch = "ppc"; then
echo "AltiVec enabled ${altivec-no}"
echo "dcbzl available ${dcbzl-no}"
fi
@@ -2143,6 +2202,7 @@ echo "libtheora enabled ${libtheora-no}"
echo "libvorbis enabled ${libvorbis-no}"
echo "libx264 enabled ${libx264-no}"
echo "libxvid enabled ${libxvid-no}"
+echo "vdpau enabled ${vdpau-no}"
echo "zlib enabled ${zlib-no}"
echo "bzlib enabled ${bzlib-no}"
echo
@@ -2209,6 +2269,12 @@ echo "SLIBSUF=$SLIBSUF" >> config.mak
echo "EXESUF=$EXESUF" >> config.mak
echo "EXTRA_VERSION=$extra_version" >> config.mak
echo "DEPEND_CMD=$DEPEND_CMD" >> config.mak
+echo "HOSTCC=$host_cc" >> config.mak
+echo "HOSTCFLAGS=$host_cflags" >> config.mak
+echo "HOSTLDFLAGS=$host_ldflags" >> config.mak
+echo "HOSTLIBS=$host_libs" >> config.mak
+echo "TARGET_EXEC=$target_exec" >> config.mak
+echo "TARGET_PATH=$target_path" >> config.mak
if enabled bigendian; then
echo "WORDS_BIGENDIAN=yes" >> config.mak
@@ -2303,13 +2369,13 @@ if enabled source_path_used; then
doc \
libavcodec \
libavcodec/alpha \
- libavcodec/armv4l \
+ libavcodec/arm \
libavcodec/bfin \
- libavcodec/i386 \
libavcodec/mlib \
libavcodec/ppc \
libavcodec/sh4 \
libavcodec/sparc \
+ libavcodec/x86 \
libavdevice \
libavfilter \
libavformat \
@@ -2346,7 +2412,7 @@ fi
pkgconfig_generate(){
name=$1
-shortname=${name#lib}
+shortname=${name#lib}${build_suffix}
comment=$2
version=$3
libs=$4
@@ -2384,16 +2450,16 @@ EOF
}
pkgconfig_generate libavutil "FFmpeg utility library" "$LIBAVUTIL_VERSION"
-pkgconfig_generate libavcodec "FFmpeg codec library" "$LIBAVCODEC_VERSION" "$extralibs" "$pkg_requires libavutil = $LIBAVUTIL_VERSION"
-pkgconfig_generate libavformat "FFmpeg container format library" "$LIBAVFORMAT_VERSION" "$extralibs" "$pkg_requires libavcodec = $LIBAVCODEC_VERSION"
-pkgconfig_generate libavdevice "FFmpeg device handling library" "$LIBAVDEVICE_VERSION" "$extralibs" "$pkg_requires libavformat = $LIBAVFORMAT_VERSION"
+pkgconfig_generate libavcodec "FFmpeg codec library" "$LIBAVCODEC_VERSION" "$extralibs" "libavutil = $LIBAVUTIL_VERSION"
+pkgconfig_generate libavformat "FFmpeg container format library" "$LIBAVFORMAT_VERSION" "$extralibs" "libavcodec = $LIBAVCODEC_VERSION"
+pkgconfig_generate libavdevice "FFmpeg device handling library" "$LIBAVDEVICE_VERSION" "$extralibs" "libavformat = $LIBAVFORMAT_VERSION"
enabled avfilter &&
- pkgconfig_generate libavfilter "FFmpeg video filtering library" "$LIBAVFILTER_VERSION" "$extralibs" "$pkg_requires libavutil = $LIBAVUTIL_VERSION"
+ pkgconfig_generate libavfilter "FFmpeg video filtering library" "$LIBAVFILTER_VERSION" "$extralibs" "libavutil = $LIBAVUTIL_VERSION"
enabled postproc &&
pkgconfig_generate libpostproc "FFmpeg post processing library" "$LIBPOSTPROC_VERSION"
if enabled swscale; then
pkgconfig_generate libswscale "FFmpeg image rescaling library" "$LIBSWSCALE_VERSION" "" "libavutil = $LIBAVUTIL_VERSION"
else
- pkgconfig_generate libswscale "FFmpeg image rescaling library" "$LIBSWSCALE_VERSION" "" "$pkg_requires libavcodec = $LIBAVCODEC_VERSION"
+ pkgconfig_generate libswscale "FFmpeg image rescaling library" "$LIBSWSCALE_VERSION" "" "libavcodec = $LIBAVCODEC_VERSION"
apply libswscale/libswscale.pc sed s/^Libs:.*$/Libs:/
fi
diff --git a/doc/faq.texi b/doc/faq.texi
index b76f4e7..e070fd2 100644
--- a/doc/faq.texi
+++ b/doc/faq.texi
@@ -166,11 +166,6 @@ Applying that to the previous example:
Beware that there is no "jpeg" codec. Use "mjpeg" instead.
-@section I get "Unsupported codec (id=86043) for input stream #0.1". What is the problem?
-
-This is the Qcelp codec, FFmpeg has no support for that codec currently.
-Try MEncoder/MPlayer, it might work.
-
@section Why do I see a slight quality degradation with multithreaded MPEG* encoding?
For multithreaded MPEG* encoding, the encoded slices must be independent,
@@ -192,14 +187,14 @@ LGPL to GPL.
@section Why does the chrominance data seem to be sampled at a different time from the luminance data on bt8x8 captures on Linux?
This is a well-known bug in the bt8x8 driver. For 2.4.26 there is a patch at
-(@url{http://svn.mplayerhq.hu/michael/trunk/patches/bttv-420-2.4.26.patch?view=co}). This may also
+(@url{http://svn.ffmpeg.org/michael/trunk/patches/bttv-420-2.4.26.patch?view=co}). This may also
apply cleanly to other 2.4-series kernels.
@section How do I avoid the ugly aliasing artifacts in bt8x8 captures on Linux?
Pass 'combfilter=1 lumafilter=1' to the bttv driver. Note though that 'combfilter=1'
-will cause somewhat too strong filtering. A fix is to apply (@url{http://svn.mplayerhq.hu/michael/trunk/patches/bttv-comb-2.4.26.patch?view=co})
-or (@url{http://svn.mplayerhq.hu/michael/trunk/patches/bttv-comb-2.6.6.patch?view=co})
+will cause somewhat too strong filtering. A fix is to apply (@url{http://svn.ffmpeg.org/michael/trunk/patches/bttv-comb-2.4.26.patch?view=co})
+or (@url{http://svn.ffmpeg.org/michael/trunk/patches/bttv-comb-2.6.6.patch?view=co})
and pass 'combfilter=2'.
@section -f jpeg doesn't work.
@@ -231,7 +226,7 @@ default.
@item non-working stuff
B-frames
@item example command line
-ffmpeg -i input -acodec libfaac -ab 128kb -vcodec mpeg4 -b 1200kb -mbd 2 -flags +4mv+trell -aic 2 -cmp 2 -subcmp 2 -s 320x180 -title X output.mp4
+ffmpeg -i input -acodec libfaac -ab 128kb -vcodec mpeg4 -b 1200kb -mbd 2 -flags +4mv -trellis 2 -aic 2 -cmp 2 -subcmp 2 -s 320x180 -title X output.mp4
@end table
@section How do I encode videos which play on the PSP?
@@ -244,7 +239,7 @@ ffmpeg -i input -acodec libfaac -ab 128kb -vcodec mpeg4 -b 1200kb -mbd 2 -flags
@item non-working stuff
B-frames
@item example command line
-ffmpeg -i input -acodec libfaac -ab 128kb -vcodec mpeg4 -b 1200kb -ar 24000 -mbd 2 -flags +4mv+trell -aic 2 -cmp 2 -subcmp 2 -s 368x192 -r 30000/1001 -title X -f psp output.mp4
+ffmpeg -i input -acodec libfaac -ab 128kb -vcodec mpeg4 -b 1200kb -ar 24000 -mbd 2 -flags +4mv -trellis 2 -aic 2 -cmp 2 -subcmp 2 -s 368x192 -r 30000/1001 -title X -f psp output.mp4
@item needed stuff for H.264
-acodec libfaac -vcodec libx264 width*height<=76800 width%16=0? height%16=0? -ar 48000 -coder 1 -r 30000/1001 or 15000/1001 -f psp
@item working stuff for H.264
@@ -261,12 +256,12 @@ ffmpeg -i input -acodec libfaac -ab 128kb -ac 2 -ar 48000 -vcodec libx264 -level
@section Which are good parameters for encoding high quality MPEG-4?
-'-mbd rd -flags +4mv+trell+aic -cmp 2 -subcmp 2 -g 300 -pass 1/2',
+'-mbd rd -flags +4mv+aic -trellis 2 -cmp 2 -subcmp 2 -g 300 -pass 1/2',
things to try: '-bf 2', '-flags qprd', '-flags mv0', '-flags skiprd'.
@section Which are good parameters for encoding high quality MPEG-1/MPEG-2?
-'-mbd rd -flags +trell -cmp 2 -subcmp 2 -g 100 -pass 1/2'
+'-mbd rd -trellis 2 -cmp 2 -subcmp 2 -g 100 -pass 1/2'
but beware the '-g 100' might cause problems with some decoders.
Things to try: '-bf 2', '-flags qprd', '-flags mv0', '-flags skiprd.
@@ -474,7 +469,7 @@ see @url{http://www.iversenit.dk/dev/ffmpeg-headers/}
@section Where is the documentation about ffv1, msmpeg4, asv1, 4xm?
-see @url{http://svn.mplayerhq.hu/michael/trunk/docs/}
+see @url{http://svn.ffmpeg.org/michael/trunk/docs/}
@section How do I feed H.263-RTP (and other codecs in RTP) to libavcodec?
diff --git a/doc/ffmpeg-doc.texi b/doc/ffmpeg-doc.texi
index 6f354c5..840ed89 100644
--- a/doc/ffmpeg-doc.texi
+++ b/doc/ffmpeg-doc.texi
@@ -439,10 +439,11 @@ tell that the raw codec data must be copied as is.
Use same video quality as source (implies VBR).
@item -pass @var{n}
-Select the pass number (1 or 2). It is useful to do two pass
-encoding. The statistics of the video are recorded in the first
-pass and the video is generated at the exact requested bitrate
-in the second pass.
+Select the pass number (1 or 2). It is used to do two-pass
+video encoding. The statistics of the video are recorded in the first
+pass into a log file (see also the option -passlogfile),
+and in the second pass that log file is used to generate the video
+at the exact requested bitrate.
On pass 1, you may just deactivate audio and set output to null,
examples for Windows and Unix:
@example
@@ -450,8 +451,11 @@ ffmpeg -i foo.mov -vcodec libxvid -pass 1 -an -f rawvideo -y NUL
ffmpeg -i foo.mov -vcodec libxvid -pass 1 -an -f rawvideo -y /dev/null
@end example
-@item -passlogfile @var{file}
-Set two pass logfile name to @var{file}.
+@item -passlogfile @var{prefix}
+Set two-pass log file name prefix to @var{prefix}, the default file name
+prefix is ``ffmpeg2pass''. The complete file name will be
+@file{PREFIX-N.log}, where N is a number specific to the output
+stream.
@item -newvideo
Add a new video stream to the current output stream.
diff --git a/doc/general.texi b/doc/general.texi
index 1db8e4a..b594fe1 100644
--- a/doc/general.texi
+++ b/doc/general.texi
@@ -288,6 +288,9 @@ following image formats are supported:
@tab fourccs: QPEG, Q1.0, Q1.1
@item RealVideo 1.0 @tab X @tab X
@item RealVideo 2.0 @tab X @tab X
+@item RealVideo 3.0 @tab @tab X
+ @tab still far from ideal
+@item RealVideo 4.0 @tab @tab X
@item Renderware TXD @tab @tab X
@tab Texture dictionaries used by the Renderware Engine.
@item RTjpeg @tab @tab X
@@ -396,6 +399,7 @@ following image formats are supported:
@item Musepack @tab @tab X
@tab SV7 and SV8 are supported.
@item Nellymoser ASAO @tab X @tab X
+@item QCELP / PureVoice @tab @tab X
@item Qdesign QDM2 @tab @tab X
@tab There are still some distortions.
@item QT IMA ADPCM @tab X @tab X
@@ -477,7 +481,7 @@ are listed below:
@itemize
@item bash 3.1
@item msys-make 3.81-2 (note: not mingw32-make)
-@item w32api 3.12
+@item w32api 3.13
@item mingw-runtime 3.15
@end itemize
@@ -658,18 +662,21 @@ Then you can easily test FFmpeg with Wine
@subsection Compilation under Cygwin
-The main issue with Cygwin is that newlib, its C library, does not
-contain llrint(). However, it is possible to leverage the
-implementation in MinGW.
+The main issue with the 1.5.x Cygwin versions is that newlib, its C library,
+does not contain llrint(). You need to upgrade to the unstable 1.7.x versions,
+or leverage the implementation in MinGW (as explained below).
Just install your Cygwin with all the "Base" packages, plus the
following "Devel" ones:
@example
-binutils, gcc-core, make, subversion, mingw-runtime
+binutils, gcc-core, make, subversion, mingw-runtime, diffutils
@end example
-Do not install binutils-20060709-1 (they are buggy on shared builds);
-use binutils-20050610-1 instead.
+The experimental gcc4 package is still buggy, hence please
+use the official gcc 3.4.4 or a 4.2.x compiled from source by yourself.
+
+Install the current binutils-20080624-2 as they work fine (the old
+binutils-20060709-1 proved buggy on shared builds).
Then create a small library that just contains llrint():
@@ -693,9 +700,22 @@ to make a static build or
to build shared libraries.
If you want to build FFmpeg with additional libraries, download Cygwin
-"Devel" packages for Ogg and Vorbis from any Cygwin packages repository
-and/or SDL, xvid, faac, faad2 packages from Cygwin Ports,
-(@url{http://cygwinports.dotsrc.org/}).
+"Devel" packages for Ogg and Vorbis from any Cygwin packages repository:
+@example
+libogg-devel, libvorbis-devel
+@end example
+
+These library packages are only available from Cygwin Ports
+(@url{http://sourceware.org/cygwinports/}) :
+
+@example
+yasm, libSDL-devel, libdirac-devel, libfaac-devel, libfaad-devel, libgsm-devel,
+libmp3lame-devel, libschroedinger1.0-devel, speex-devel, libtheora-devel,
+libxvidcore-devel
+@end example
+
+The recommendation for libnut and x264 is to build them from source by
+yourself, as they evolve too quickly for Cygwin Ports to be up to date.
@subsection Crosscompilation for Windows under Cygwin
@@ -1067,7 +1087,7 @@ do not attach several unrelated patches to the same mail.
If the patch fixes a bug, did you provide enough information, including
a sample, so the bug can be reproduced and the fix can be verified?
Note please do not attach samples >100k to mails but rather provide a
- URL, you can upload to ftp://upload.mplayerhq.hu
+ URL, you can upload to ftp://upload.ffmpeg.org
@item
Did you provide a verbose summary about what the patch does change?
@item
diff --git a/doc/issue_tracker.txt b/doc/issue_tracker.txt
index cab56cf..e5a74db 100644
--- a/doc/issue_tracker.txt
+++ b/doc/issue_tracker.txt
@@ -15,9 +15,10 @@ be properly added to the respective issue.
The subscription URL for the ffmpeg-issues list is:
http://live.polito/mailman/listinfo/ffmpeg-issues
The URL of the webinterface of the tracker is:
-http(s)://roundup.mplayerhq/roundup/ffmpeg/
+http(s)://roundup.ffmpeg/roundup/ffmpeg/
Note the URLs in this document are obfuscated, you must append the top level
-domain of Hungary to the tracker, and of Italy to the mailing list.
+domain for non-profit organizations to the tracker, and of Italy to the
+mailing list.
Email Interface:
----------------
diff --git a/doc/optimization.txt b/doc/optimization.txt
index 50630e7..5469adc 100644
--- a/doc/optimization.txt
+++ b/doc/optimization.txt
@@ -4,11 +4,11 @@ optimization Tips (for libavcodec):
What to optimize:
-----------------
If you plan to do non-x86 architecture specific optimizations (SIMD normally),
-then take a look in the i386/ directory, as most important functions are
+then take a look in the x86/ directory, as most important functions are
already optimized for MMX.
If you want to do x86 optimizations then you can either try to finetune the
-stuff in the i386 directory or find some other functions in the C source to
+stuff in the x86 directory or find some other functions in the C source to
optimize, but there aren't many left.
@@ -18,9 +18,9 @@ As many functions tend to be a bit difficult to understand because
of optimizations, it can be hard to optimize them further, or write
architecture-specific versions. It is recommended to look at older
revisions of the interesting files (for a web frontend try ViewVC at
-http://svn.mplayerhq.hu/ffmpeg/trunk/).
+http://svn.ffmpeg.org/ffmpeg/trunk/).
Alternatively, look into the other architecture-specific versions in
-the i386/, ppc/, alpha/ subdirectories. Even if you don't exactly
+the x86/, ppc/, alpha/ subdirectories. Even if you don't exactly
comprehend the instructions, it could help understanding the functions
and how they can be optimized.
diff --git a/ffmpeg.c b/ffmpeg.c
index 9d74491..aac071e 100644
--- a/ffmpeg.c
+++ b/ffmpeg.c
@@ -151,6 +151,7 @@ static int qp_hist = 0;
static int intra_only = 0;
static int audio_sample_rate = 44100;
+static int64_t channel_layout = 0;
#define QSCALE_NONE -99999
static float audio_qscale = QSCALE_NONE;
static int audio_disable = 0;
@@ -182,7 +183,7 @@ static int do_hex_dump = 0;
static int do_pkt_dump = 0;
static int do_psnr = 0;
static int do_pass = 0;
-static char *pass_logfilename = NULL;
+static char *pass_logfilename_prefix = NULL;
static int audio_stream_copy = 0;
static int video_stream_copy = 0;
static int subtitle_stream_copy = 0;
@@ -190,11 +191,12 @@ static int video_sync_method= -1;
static int audio_sync_method= 0;
static float audio_drift_threshold= 0.1;
static int copy_ts= 0;
-static int opt_shortest = 0; //
+static int opt_shortest = 0;
static int video_global_header = 0;
static char *vstats_filename;
static FILE *vstats_file;
static int opt_programid = 0;
+static int copy_initial_nonkeyframes = 0;
static int rate_emu = 0;
@@ -215,7 +217,7 @@ static int64_t extra_size = 0;
static int nb_frames_dup = 0;
static int nb_frames_drop = 0;
static int input_sync;
-static uint64_t limit_filesize = 0; //
+static uint64_t limit_filesize = 0;
static int force_fps = 0;
static int pgmyuv_compatibility_hack=0;
@@ -230,7 +232,7 @@ static AVBitStreamFilterContext *audio_bitstream_filters=NULL;
static AVBitStreamFilterContext *subtitle_bitstream_filters=NULL;
static AVBitStreamFilterContext *bitstream_filters[MAX_FILES][MAX_STREAMS];
-#define DEFAULT_PASS_LOGFILENAME "ffmpeg2pass"
+#define DEFAULT_PASS_LOGFILENAME_PREFIX "ffmpeg2pass"
struct AVInputStream;
@@ -280,7 +282,6 @@ typedef struct AVInputStream {
int64_t sample_index; /* current sample */
int64_t start; /* time when read started */
- unsigned long frame; /* current frame */
int64_t next_pts; /* synthetic pts for cases where pkt.pts
is not defined */
int64_t pts; /* current pts */
@@ -395,9 +396,17 @@ static int av_exit(int ret)
if (!(s->oformat->flags & AVFMT_NOFILE) && s->pb)
url_fclose(s->pb);
for(j=0;j<s->nb_streams;j++) {
+ av_metadata_free(&s->streams[j]->metadata);
av_free(s->streams[j]->codec);
av_free(s->streams[j]);
}
+ for(j=0;j<s->nb_programs;j++) {
+ av_metadata_free(&s->programs[j]->metadata);
+ }
+ for(j=0;j<s->nb_chapters;j++) {
+ av_metadata_free(&s->chapters[j]->metadata);
+ }
+ av_metadata_free(&s->metadata);
av_free(s);
}
for(i=0;i<nb_input_files;i++)
@@ -419,7 +428,7 @@ static int av_exit(int ret)
av_free(video_standard);
#ifdef CONFIG_POWERPC_PERF
- extern void powerpc_display_perf_report(void);
+ void powerpc_display_perf_report(void);
powerpc_display_perf_report();
#endif /* CONFIG_POWERPC_PERF */
@@ -1305,13 +1314,11 @@ static int output_packet(AVInputStream *ist, int ist_index,
}
/* frame rate emulation */
- if (ist->st->codec->rate_emu) {
- int64_t pts = av_rescale((int64_t) ist->frame * ist->st->codec->time_base.num, 1000000, ist->st->codec->time_base.den);
+ if (rate_emu) {
+ int64_t pts = av_rescale(ist->pts, 1000000, AV_TIME_BASE);
int64_t now = av_gettime() - ist->start;
if (pts > now)
usleep(pts - now);
-
- ist->frame++;
}
/* if output time reached then transcode raw format,
@@ -1355,7 +1362,7 @@ static int output_packet(AVInputStream *ist, int ist_index,
AVPacket opkt;
av_init_packet(&opkt);
- if (!ost->frame_number && !(pkt->flags & PKT_FLAG_KEY))
+ if ((!ost->frame_number && !(pkt->flags & PKT_FLAG_KEY)) && !copy_initial_nonkeyframes)
continue;
/* no reencoding needed : output the packet directly */
@@ -1402,8 +1409,9 @@ static int output_packet(AVInputStream *ist, int ist_index,
if (subtitle_to_free) {
if (subtitle_to_free->rects != NULL) {
for (i = 0; i < subtitle_to_free->num_rects; i++) {
- av_free(subtitle_to_free->rects[i].bitmap);
- av_free(subtitle_to_free->rects[i].rgba_palette);
+ av_freep(&subtitle_to_free->rects[i]->pict.data[0]);
+ av_freep(&subtitle_to_free->rects[i]->pict.data[1]);
+ av_freep(&subtitle_to_free->rects[i]);
}
av_freep(&subtitle_to_free->rects);
}
@@ -1488,6 +1496,7 @@ static void print_sdp(AVFormatContext **avc, int n)
avf_sdp_create(avc, n, sdp, sizeof(sdp));
printf("SDP:\n%s\n", sdp);
+ fflush(stdout);
}
static int stream_index_from_inputs(AVFormatContext **input_files,
@@ -1569,9 +1578,8 @@ static int av_encode(AVFormatContext **output_files,
ist->discard = 1; /* the stream is discarded by default
(changed later) */
- if (ist->st->codec->rate_emu) {
+ if (rate_emu) {
ist->start = av_gettime();
- ist->frame = 0;
}
}
}
@@ -1581,7 +1589,8 @@ static int av_encode(AVFormatContext **output_files,
for(i=0;i<nb_output_files;i++) {
os = output_files[i];
if (!os->nb_streams) {
- fprintf(stderr, "Output file does not contain any stream\n");
+ dump_format(output_files[i], i, output_files[i]->filename, 1);
+ fprintf(stderr, "Output file #%d does not contain any stream\n", i);
av_exit(1);
}
nb_ostreams += os->nb_streams;
@@ -1733,6 +1742,7 @@ static int av_encode(AVFormatContext **output_files,
fprintf(stderr,"-acodec copy and -vol are incompatible (frames are not decoded)\n");
av_exit(1);
}
+ codec->channel_layout = icodec->channel_layout;
codec->sample_rate = icodec->sample_rate;
codec->channels = icodec->channels;
codec->frame_size = icodec->frame_size;
@@ -1836,12 +1846,12 @@ static int av_encode(AVFormatContext **output_files,
char *logbuffer;
snprintf(logfilename, sizeof(logfilename), "%s-%d.log",
- pass_logfilename ?
- pass_logfilename : DEFAULT_PASS_LOGFILENAME, i);
+ pass_logfilename_prefix ? pass_logfilename_prefix : DEFAULT_PASS_LOGFILENAME_PREFIX,
+ i);
if (codec->flags & CODEC_FLAG_PASS1) {
f = fopen(logfilename, "w");
if (!f) {
- perror(logfilename);
+ fprintf(stderr, "Cannot write log file '%s' for pass-1 encoding: %s\n", logfilename, strerror(errno));
av_exit(1);
}
ost->logfile = f;
@@ -1849,7 +1859,7 @@ static int av_encode(AVFormatContext **output_files,
/* read the log file */
f = fopen(logfilename, "r");
if (!f) {
- perror(logfilename);
+ fprintf(stderr, "Cannot read log file '%s' for pass-2 encoding: %s\n", logfilename, strerror(errno));
av_exit(1);
}
fseek(f, 0, SEEK_END);
@@ -2821,6 +2831,7 @@ static void opt_input_file(const char *filename)
case CODEC_TYPE_AUDIO:
set_context_opts(enc, avctx_opts[CODEC_TYPE_AUDIO], AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_DECODING_PARAM);
//fprintf(stderr, "\nInput Audio channels: %d", enc->channels);
+ channel_layout = enc->channel_layout;
audio_channels = enc->channels;
audio_sample_rate = enc->sample_rate;
audio_sample_fmt = enc->sample_fmt;
@@ -2856,7 +2867,6 @@ static void opt_input_file(const char *filename)
frame_rate.num = rfps;
frame_rate.den = rfps_base;
- enc->rate_emu = rate_emu;
input_codecs[nb_icodecs++] = avcodec_find_decoder_by_name(video_codec_name);
if(video_disable)
ic->streams[i]->discard= AVDISCARD_ALL;
@@ -2891,7 +2901,6 @@ static void opt_input_file(const char *filename)
video_channel = 0;
- rate_emu = 0;
av_freep(&video_codec_name);
av_freep(&audio_codec_name);
av_freep(&subtitle_codec_name);
@@ -3130,6 +3139,7 @@ static void new_audio_stream(AVFormatContext *oc)
audio_enc->thread_count = thread_count;
audio_enc->channels = audio_channels;
audio_enc->sample_fmt = audio_sample_fmt;
+ audio_enc->channel_layout = channel_layout;
if(codec && codec->sample_fmts){
const enum SampleFormat *p= codec->sample_fmts;
@@ -3702,7 +3712,7 @@ static int opt_preset(const char *opt, const char *arg)
continue;
e|= sscanf(line, "%999[^=]=%999[^\n]\n", tmp, tmp2) - 2;
if(e){
- fprintf(stderr, "%s: Preset file invalid\n", filename);
+ fprintf(stderr, "%s: Invalid syntax: '%s'\n", filename, line);
av_exit(1);
}
if(!strcmp(tmp, "acodec")){
@@ -3712,7 +3722,7 @@ static int opt_preset(const char *opt, const char *arg)
}else if(!strcmp(tmp, "scodec")){
opt_subtitle_codec(tmp2);
}else if(opt_default(tmp, tmp2) < 0){
- fprintf(stderr, "%s: Invalid option or argument: %s=%s\n", filename, tmp, tmp2);
+ fprintf(stderr, "%s: Invalid option or argument: '%s', parsed as '%s' = '%s'\n", filename, line, tmp, tmp2);
av_exit(1);
}
}
@@ -3767,6 +3777,7 @@ static const OptionDef options[] = {
{ "dts_delta_threshold", HAS_ARG | OPT_FLOAT | OPT_EXPERT, {(void*)&dts_delta_threshold}, "timestamp discontinuity delta threshold", "threshold" },
{ "programid", HAS_ARG | OPT_INT | OPT_EXPERT, {(void*)&opt_programid}, "desired program number", "" },
{ "xerror", OPT_BOOL, {(void*)&exit_on_error}, "exit on error", "error" },
+ { "copyinkf", OPT_BOOL | OPT_EXPERT, {(void*)&copy_initial_nonkeyframes}, "copy initial non-keyframes" },
/* video options */
{ "b", OPT_FUNC2 | HAS_ARG | OPT_VIDEO, {(void*)opt_bitrate}, "set bitrate (in bits/s)", "bitrate" },
@@ -3795,7 +3806,7 @@ static const OptionDef options[] = {
{ "sameq", OPT_BOOL | OPT_VIDEO, {(void*)&same_quality},
"use same video quality as source (implies VBR)" },
{ "pass", HAS_ARG | OPT_VIDEO, {(void*)&opt_pass}, "select the pass number (1 or 2)", "n" },
- { "passlogfile", HAS_ARG | OPT_STRING | OPT_VIDEO, {(void*)&pass_logfilename}, "select two pass log file name", "file" },
+ { "passlogfile", HAS_ARG | OPT_STRING | OPT_VIDEO, {(void*)&pass_logfilename_prefix}, "select two pass log file name prefix", "prefix" },
{ "deinterlace", OPT_BOOL | OPT_EXPERT | OPT_VIDEO, {(void*)&do_deinterlace},
"deinterlace pictures" },
{ "psnr", OPT_BOOL | OPT_EXPERT | OPT_VIDEO, {(void*)&do_psnr}, "calculate PSNR of compressed frames" },
@@ -3873,22 +3884,18 @@ int main(int argc, char **argv)
sws_opts = sws_getContext(16,16,0, 16,16,0, sws_flags, NULL,NULL,NULL);
show_banner();
- if (argc <= 1) {
- show_help();
- av_exit(1);
- }
/* parse options */
parse_options(argc, argv, options, opt_output_file);
/* file converter / grab */
if (nb_output_files <= 0) {
- fprintf(stderr, "Must supply at least one output file\n");
+ fprintf(stderr, "At least one output file must be specified\n");
av_exit(1);
}
if (nb_input_files == 0) {
- fprintf(stderr, "Must supply at least one input file\n");
+ fprintf(stderr, "At least one input file must be specified\n");
av_exit(1);
}
diff --git a/ffplay.c b/ffplay.c
index 67fbff2..12190da 100644
--- a/ffplay.c
+++ b/ffplay.c
@@ -441,10 +441,10 @@ static void blend_subrect(AVPicture *dst, const AVSubtitleRect *rect, int imgw,
const uint32_t *pal;
int dstx, dsty, dstw, dsth;
- dstx = FFMIN(FFMAX(rect->x, 0), imgw);
- dstw = FFMIN(FFMAX(rect->w, 0), imgw - dstx);
- dsty = FFMIN(FFMAX(rect->y, 0), imgh);
- dsth = FFMIN(FFMAX(rect->h, 0), imgh - dsty);
+ dstw = av_clip(rect->w, 0, imgw);
+ dsth = av_clip(rect->h, 0, imgh);
+ dstx = av_clip(rect->x, 0, imgw - dstw);
+ dsty = av_clip(rect->y, 0, imgh - dsth);
lum = dst->data[0] + dsty * dst->linesize[0];
cb = dst->data[1] + (dsty >> 1) * dst->linesize[1];
cr = dst->data[2] + (dsty >> 1) * dst->linesize[2];
@@ -452,9 +452,9 @@ static void blend_subrect(AVPicture *dst, const AVSubtitleRect *rect, int imgw,
width2 = (dstw + 1) >> 1;
skip2 = dstx >> 1;
wrap = dst->linesize[0];
- wrap3 = rect->linesize;
- p = rect->bitmap;
- pal = rect->rgba_palette; /* Now in YCrCb! */
+ wrap3 = rect->pict.linesize[0];
+ p = rect->pict.data[0];
+ pal = (const uint32_t *)rect->pict.data[1]; /* Now in YCrCb! */
if (dsty & 1) {
lum += dstx;
@@ -636,8 +636,9 @@ static void free_subpicture(SubPicture *sp)
for (i = 0; i < sp->sub.num_rects; i++)
{
- av_free(sp->sub.rects[i].bitmap);
- av_free(sp->sub.rects[i].rgba_palette);
+ av_freep(&sp->sub.rects[i]->pict.data[0]);
+ av_freep(&sp->sub.rects[i]->pict.data[1]);
+ av_freep(&sp->sub.rects[i]);
}
av_free(sp->sub.rects);
@@ -721,7 +722,7 @@ static void video_image_display(VideoState *is)
pict.linesize[2] = vp->bmp->pitches[1];
for (i = 0; i < sp->sub.num_rects; i++)
- blend_subrect(&pict, &sp->sub.rects[i],
+ blend_subrect(&pict, sp->sub.rects[i],
vp->bmp->w, vp->bmp->h);
SDL_UnlockYUVOverlay (vp->bmp);
@@ -1024,7 +1025,7 @@ static void video_refresh_timer(void *opaque)
/* compute nominal delay */
delay = vp->pts - is->frame_last_pts;
- if (delay <= 0 || delay >= 2.0) {
+ if (delay <= 0 || delay >= 10.0) {
/* if incorrect delay, use previous one */
delay = is->frame_last_delay;
}
@@ -1435,13 +1436,13 @@ static int subtitle_thread(void *arg)
for (i = 0; i < sp->sub.num_rects; i++)
{
- for (j = 0; j < sp->sub.rects[i].nb_colors; j++)
+ for (j = 0; j < sp->sub.rects[i]->nb_colors; j++)
{
- RGBA_IN(r, g, b, a, sp->sub.rects[i].rgba_palette + j);
+ RGBA_IN(r, g, b, a, (uint32_t*)sp->sub.rects[i]->pict.data[1] + j);
y = RGB_TO_Y_CCIR(r, g, b);
u = RGB_TO_U_CCIR(r, g, b, 0);
v = RGB_TO_V_CCIR(r, g, b, 0);
- YUVA_OUT(sp->sub.rects[i].rgba_palette + j, y, u, v, a);
+ YUVA_OUT((uint32_t*)sp->sub.rects[i]->pict.data[1] + j, y, u, v, a);
}
}
@@ -2062,12 +2063,19 @@ static int decode_thread(void *arg)
/* if the queue are full, no need to read more */
if (is->audioq.size > MAX_AUDIOQ_SIZE ||
is->videoq.size > MAX_VIDEOQ_SIZE ||
- is->subtitleq.size > MAX_SUBTITLEQ_SIZE ||
- url_feof(ic->pb)) {
+ is->subtitleq.size > MAX_SUBTITLEQ_SIZE) {
/* wait 10 ms */
SDL_Delay(10);
continue;
}
+ if(url_feof(ic->pb)) {
+ av_init_packet(pkt);
+ pkt->data=
+ pkt->size=0;
+ pkt->stream_index= is->video_stream;
+ packet_queue_put(&is->videoq, pkt);
+ continue;
+ }
ret = av_read_frame(ic, pkt);
if (ret < 0) {
if (url_ferror(ic->pb) == 0) {
@@ -2553,7 +2561,7 @@ int main(int argc, char **argv)
parse_options(argc, argv, options, opt_input_file);
if (!input_filename) {
- show_help();
+ fprintf(stderr, "An input file must be specified\n");
exit(1);
}
diff --git a/ffpresets/libx264-fastfirstpass.ffpreset b/ffpresets/libx264-fastfirstpass.ffpreset
index f2ce57b..aaad461 100644
--- a/ffpresets/libx264-fastfirstpass.ffpreset
+++ b/ffpresets/libx264-fastfirstpass.ffpreset
@@ -14,7 +14,8 @@ qcomp=0.6
qmin=10
qmax=51
qdiff=4
+bf=4
refs=1
-directpred=1
+directpred=3
trellis=0
flags2=-bpyramid-wpred-mixed_refs-dct8x8+fastpskip
diff --git a/ffpresets/libx264-hq.ffpreset b/ffpresets/libx264-hq.ffpreset
index 3f6cda6..cdf67eb 100644
--- a/ffpresets/libx264-hq.ffpreset
+++ b/ffpresets/libx264-hq.ffpreset
@@ -9,12 +9,12 @@ g=250
keyint_min=25
sc_threshold=40
i_qfactor=0.71
-b_strategy=1
+b_strategy=2
qcomp=0.6
qmin=10
qmax=51
qdiff=4
-bf=16
+bf=4
refs=4
directpred=3
trellis=1
diff --git a/ffpresets/libx264-lossless_fast.ffpreset b/ffpresets/libx264-lossless_fast.ffpreset
new file mode 100644
index 0000000..dcf418a
--- /dev/null
+++ b/ffpresets/libx264-lossless_fast.ffpreset
@@ -0,0 +1,19 @@
+coder=0
+flags=+loop
+cmp=+chroma
+partitions=-parti8x8+parti4x4+partp8x8-partp4x4-partb8x8
+me_method=hex
+subq=3
+me_range=16
+g=250
+keyint_min=25
+sc_threshold=40
+i_qfactor=0.71
+b_strategy=1
+qcomp=0.6
+qmin=10
+qmax=51
+qdiff=4
+directpred=1
+flags2=+fastpskip
+cqp=0
diff --git a/ffpresets/libx264-lossless_max.ffpreset b/ffpresets/libx264-lossless_max.ffpreset
new file mode 100644
index 0000000..b8506c2
--- /dev/null
+++ b/ffpresets/libx264-lossless_max.ffpreset
@@ -0,0 +1,20 @@
+coder=1
+flags=+loop
+cmp=+chroma
+partitions=+parti8x8+parti4x4+partp8x8+partp4x4-partb8x8
+me_method=esa
+subq=8
+me_range=16
+g=250
+keyint_min=25
+sc_threshold=40
+i_qfactor=0.71
+b_strategy=1
+qcomp=0.6
+qmin=10
+qmax=51
+qdiff=4
+refs=16
+directpred=1
+flags2=+mixed_refs+dct8x8+fastpskip
+cqp=0
diff --git a/ffpresets/libx264-lossless_medium.ffpreset b/ffpresets/libx264-lossless_medium.ffpreset
new file mode 100644
index 0000000..99fb6b9
--- /dev/null
+++ b/ffpresets/libx264-lossless_medium.ffpreset
@@ -0,0 +1,19 @@
+coder=1
+flags=+loop
+cmp=+chroma
+partitions=-parti8x8+parti4x4+partp8x8+partp4x4-partb8x8
+me_method=hex
+subq=5
+me_range=16
+g=250
+keyint_min=25
+sc_threshold=40
+i_qfactor=0.71
+b_strategy=1
+qcomp=0.6
+qmin=10
+qmax=51
+qdiff=4
+directpred=1
+flags2=+fastpskip
+cqp=0
diff --git a/ffpresets/libx264-lossless_slow.ffpreset b/ffpresets/libx264-lossless_slow.ffpreset
new file mode 100644
index 0000000..2ecb55b
--- /dev/null
+++ b/ffpresets/libx264-lossless_slow.ffpreset
@@ -0,0 +1,20 @@
+coder=1
+flags=+loop
+cmp=+chroma
+partitions=+parti8x8+parti4x4+partp8x8+partp4x4-partb8x8
+me_method=umh
+subq=6
+me_range=16
+g=250
+keyint_min=25
+sc_threshold=40
+i_qfactor=0.71
+b_strategy=1
+qcomp=0.6
+qmin=10
+qmax=51
+qdiff=4
+refs=2
+directpred=1
+flags2=+dct8x8+fastpskip
+cqp=0
diff --git a/ffpresets/libx264-lossless_slower.ffpreset b/ffpresets/libx264-lossless_slower.ffpreset
new file mode 100644
index 0000000..dd499c7
--- /dev/null
+++ b/ffpresets/libx264-lossless_slower.ffpreset
@@ -0,0 +1,20 @@
+coder=1
+flags=+loop
+cmp=+chroma
+partitions=+parti8x8+parti4x4+partp8x8+partp4x4-partb8x8
+me_method=umh
+subq=8
+me_range=16
+g=250
+keyint_min=25
+sc_threshold=40
+i_qfactor=0.71
+b_strategy=1
+qcomp=0.6
+qmin=10
+qmax=51
+qdiff=4
+refs=4
+directpred=1
+flags2=+mixed_refs+dct8x8+fastpskip
+cqp=0
diff --git a/ffpresets/libx264-lossless_ultrafast.ffpreset b/ffpresets/libx264-lossless_ultrafast.ffpreset
new file mode 100644
index 0000000..1c429f2
--- /dev/null
+++ b/ffpresets/libx264-lossless_ultrafast.ffpreset
@@ -0,0 +1,19 @@
+coder=0
+flags=+loop
+cmp=+chroma
+partitions=-parti8x8-parti4x4-partp8x8-partp4x4-partb8x8
+me_method=dia
+subq=0
+me_range=16
+g=250
+keyint_min=25
+sc_threshold=40
+i_qfactor=0.71
+b_strategy=1
+qcomp=0.6
+qmin=10
+qmax=51
+qdiff=4
+directpred=1
+flags2=+fastpskip
+cqp=0
diff --git a/ffpresets/libx264-max.ffpreset b/ffpresets/libx264-max.ffpreset
index 3638480..afdde53 100644
--- a/ffpresets/libx264-max.ffpreset
+++ b/ffpresets/libx264-max.ffpreset
@@ -9,12 +9,12 @@ g=250
keyint_min=25
sc_threshold=40
i_qfactor=0.71
-b_strategy=1
+b_strategy=2
qcomp=0.6
qmin=10
qmax=51
qdiff=4
-bf=16
+bf=4
refs=16
directpred=3
trellis=2
diff --git a/ffpresets/libx264-normal.ffpreset b/ffpresets/libx264-normal.ffpreset
index 180b9d2..99ac2e3 100644
--- a/ffpresets/libx264-normal.ffpreset
+++ b/ffpresets/libx264-normal.ffpreset
@@ -14,7 +14,7 @@ qcomp=0.6
qmin=10
qmax=51
qdiff=4
-bf=16
+bf=4
refs=2
directpred=3
trellis=0
diff --git a/ffpresets/libx264-slowfirstpass.ffpreset b/ffpresets/libx264-slowfirstpass.ffpreset
new file mode 100644
index 0000000..7358d44
--- /dev/null
+++ b/ffpresets/libx264-slowfirstpass.ffpreset
@@ -0,0 +1,21 @@
+coder=1
+flags=+loop
+cmp=+chroma
+partitions=+parti8x8+parti4x4+partp8x8+partb8x8
+me_method=hex
+subq=6
+me_range=16
+g=250
+keyint_min=25
+sc_threshold=40
+i_qfactor=0.71
+b_strategy=2
+qcomp=0.6
+qmin=10
+qmax=51
+qdiff=4
+bf=4
+refs=1
+directpred=3
+trellis=0
+flags2=+bpyramid+wpred+dct8x8+fastpskip
diff --git a/ffserver.c b/ffserver.c
index 168f996..23bb64c 100644
--- a/ffserver.c
+++ b/ffserver.c
@@ -3507,7 +3507,7 @@ static void build_feed_streams(void)
}
}
if (!url_exist(feed->feed_filename)) {
- AVFormatContext s1, *s = &s1;
+ AVFormatContext s1 = {0}, *s = &s1;
if (feed->readonly) {
http_log("Unable to create feed file '%s' as it is marked readonly\n",
@@ -3686,7 +3686,7 @@ static void add_codec(FFStream *stream, AVCodecContext *av)
memcpy(st->codec, av, sizeof(AVCodecContext));
}
-static int opt_audio_codec(const char *arg)
+static enum CodecID opt_audio_codec(const char *arg)
{
AVCodec *p= avcodec_find_encoder_by_name(arg);
@@ -3696,7 +3696,7 @@ static int opt_audio_codec(const char *arg)
return p->id;
}
-static int opt_video_codec(const char *arg)
+static enum CodecID opt_video_codec(const char *arg)
{
AVCodec *p= avcodec_find_encoder_by_name(arg);
@@ -3735,13 +3735,11 @@ static void load_module(const char *filename)
static int ffserver_opt_default(const char *opt, const char *arg,
AVCodecContext *avctx, int type)
{
- const AVOption *o = NULL;
- const AVOption *o2 = av_find_opt(avctx, opt, NULL, type, type);
- if(o2)
- o = av_set_string2(avctx, opt, arg, 1);
- if(!o)
- return -1;
- return 0;
+ int ret = 0;
+ const AVOption *o = av_find_opt(avctx, opt, NULL, type, type);
+ if(o)
+ ret = av_set_string3(avctx, opt, arg, 1, NULL);
+ return ret;
}
static int parse_ffconfig(const char *filename)
@@ -3755,7 +3753,7 @@ static int parse_ffconfig(const char *filename)
FFStream **last_stream, *stream, *redirect;
FFStream **last_feed, *feed;
AVCodecContext audio_enc, video_enc;
- int audio_id, video_id;
+ enum CodecID audio_id, video_id;
f = fopen(filename, "r");
if (!f) {
@@ -4450,7 +4448,7 @@ static void opt_debug()
static void opt_show_help(void)
{
- printf("usage: FFserver [options]\n"
+ printf("usage: ffserver [options]\n"
"Hyper fast multi format Audio/Video streaming server\n");
printf("\n");
show_help_options(options, "Main options:\n", 0, 0);
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 0540c20..3b179d7 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -3,6 +3,8 @@ include $(SUBDIR)../config.mak
NAME = avcodec
FFLIBS = avutil
+HEADERS = avcodec.h opt.h vdpau.h
+
OBJS = allcodecs.o \
audioconvert.o \
bitstream.o \
@@ -20,14 +22,15 @@ OBJS = allcodecs.o \
simple_idct.o \
utils.o \
-
-HEADERS = avcodec.h opt.h
-
+# parts needed for many different codecs
+OBJS-$(CONFIG_AANDCT) += aandcttab.o
OBJS-$(CONFIG_ENCODERS) += faandct.o jfdctfst.o jfdctint.o
OBJS-$(CONFIG_FFT) += fft.o
OBJS-$(CONFIG_GOLOMB) += golomb.o
OBJS-$(CONFIG_MDCT) += mdct.o
+OBJS-$(CONFIG_OLDSCALER) += imgresample.o
+# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER) += aac.o aactab.o
OBJS-$(CONFIG_AASC_DECODER) += aasc.o msrledec.o
OBJS-$(CONFIG_AC3_DECODER) += eac3dec.o ac3dec.o ac3tab.o ac3dec_data.o ac3.o
@@ -97,6 +100,7 @@ OBJS-$(CONFIG_H263_ENCODER) += mpegvideo_enc.o motion_est.o ratecontr
OBJS-$(CONFIG_H263P_ENCODER) += mpegvideo_enc.o motion_est.o ratecontrol.o h263.o mpeg12data.o mpegvideo.o error_resilience.o
OBJS-$(CONFIG_H264_DECODER) += h264.o h264idct.o h264pred.o h264_parser.o cabac.o mpegvideo.o error_resilience.o
OBJS-$(CONFIG_H264_ENCODER) += h264enc.o h264dspenc.o
+OBJS-$(CONFIG_H264_VDPAU_DECODER) += vdpauvideo.o
OBJS-$(CONFIG_HUFFYUV_DECODER) += huffyuv.o
OBJS-$(CONFIG_HUFFYUV_ENCODER) += huffyuv.o
OBJS-$(CONFIG_IDCIN_DECODER) += idcinvideo.o
@@ -155,6 +159,7 @@ OBJS-$(CONFIG_PNG_DECODER) += png.o pngdec.o
OBJS-$(CONFIG_PNG_ENCODER) += png.o pngenc.o
OBJS-$(CONFIG_PPM_ENCODER) += pnmenc.o pnm.o
OBJS-$(CONFIG_PTX_DECODER) += ptx.o
+OBJS-$(CONFIG_QCELP_DECODER) += qcelpdec.o qcelp_lsp.o celp_math.o celp_filters.o
OBJS-$(CONFIG_QDM2_DECODER) += qdm2.o mpegaudiodec.o mpegaudiodecheader.o mpegaudio.o mpegaudiodata.o
OBJS-$(CONFIG_QDRAW_DECODER) += qdrw.o
OBJS-$(CONFIG_QPEG_DECODER) += qpeg.o
@@ -174,6 +179,8 @@ OBJS-$(CONFIG_RV10_DECODER) += rv10.o h263.o mpeg12data.o mpegvideo.o
OBJS-$(CONFIG_RV10_ENCODER) += rv10.o mpegvideo_enc.o motion_est.o ratecontrol.o h263.o mpeg12data.o mpegvideo.o error_resilience.o
OBJS-$(CONFIG_RV20_DECODER) += rv10.o h263.o mpeg12data.o mpegvideo.o error_resilience.o
OBJS-$(CONFIG_RV20_ENCODER) += rv10.o mpegvideo_enc.o motion_est.o ratecontrol.o h263.o mpeg12data.o mpegvideo.o error_resilience.o
+OBJS-$(CONFIG_RV30_DECODER) += rv30.o rv34.o h264pred.o rv30dsp.o
+OBJS-$(CONFIG_RV40_DECODER) += rv40.o rv34.o h264pred.o rv40dsp.o
OBJS-$(CONFIG_SGI_DECODER) += sgidec.o
OBJS-$(CONFIG_SGI_ENCODER) += sgienc.o rle.o
OBJS-$(CONFIG_SHORTEN_DECODER) += shorten.o
@@ -190,13 +197,13 @@ OBJS-$(CONFIG_SP5X_DECODER) += sp5xdec.o mjpegdec.o mjpeg.o
OBJS-$(CONFIG_SUNRAST_DECODER) += sunrast.o
OBJS-$(CONFIG_SVQ1_DECODER) += svq1dec.o svq1.o h263.o mpeg12data.o mpegvideo.o error_resilience.o
OBJS-$(CONFIG_SVQ1_ENCODER) += svq1enc.o svq1.o motion_est.o h263.o mpeg12data.o mpegvideo.o error_resilience.o
-OBJS-$(CONFIG_SVQ3_DECODER) += h264.o h264idct.o h264pred.o h264_parser.o cabac.o mpegvideo.o error_resilience.o
+OBJS-$(CONFIG_SVQ3_DECODER) += h264.o h264idct.o h264pred.o h264_parser.o cabac.o mpegvideo.o error_resilience.o svq1dec.o
OBJS-$(CONFIG_TARGA_DECODER) += targa.o
OBJS-$(CONFIG_TARGA_ENCODER) += targaenc.o rle.o
OBJS-$(CONFIG_THEORA_DECODER) += vp3.o xiph.o vp3dsp.o
OBJS-$(CONFIG_THP_DECODER) += mjpegdec.o mjpeg.o
OBJS-$(CONFIG_TIERTEXSEQVIDEO_DECODER) += tiertexseqv.o
-OBJS-$(CONFIG_TIFF_DECODER) += tiff.o lzw.o
+OBJS-$(CONFIG_TIFF_DECODER) += tiff.o lzw.o faxcompr.o
OBJS-$(CONFIG_TIFF_ENCODER) += tiffenc.o rle.o lzwenc.o
OBJS-$(CONFIG_TRUEMOTION1_DECODER) += truemotion1.o
OBJS-$(CONFIG_TRUEMOTION2_DECODER) += truemotion2.o
@@ -237,11 +244,13 @@ OBJS-$(CONFIG_XAN_WC3_DECODER) += xan.o
OBJS-$(CONFIG_XAN_WC4_DECODER) += xan.o
OBJS-$(CONFIG_XL_DECODER) += xl.o
OBJS-$(CONFIG_XSUB_DECODER) += xsubdec.o
+OBJS-$(CONFIG_XVMC) += xvmcvideo.o
OBJS-$(CONFIG_ZLIB_DECODER) += lcldec.o
OBJS-$(CONFIG_ZLIB_ENCODER) += lclenc.o
OBJS-$(CONFIG_ZMBV_DECODER) += zmbv.o
OBJS-$(CONFIG_ZMBV_ENCODER) += zmbvenc.o
+# (AD)PCM decoders/encoders
OBJS-$(CONFIG_PCM_ALAW_DECODER) += pcm.o
OBJS-$(CONFIG_PCM_ALAW_ENCODER) += pcm.o
OBJS-$(CONFIG_PCM_DVD_DECODER) += pcm.o
@@ -344,7 +353,7 @@ OBJS-$(CONFIG_LIBVORBIS) += libvorbis.o
OBJS-$(CONFIG_LIBX264) += libx264.o
OBJS-$(CONFIG_LIBXVID) += libxvidff.o libxvid_rc.o
-
+# parsers
OBJS-$(CONFIG_AAC_PARSER) += aac_parser.o aac_ac3_parser.o mpeg4audio.o
OBJS-$(CONFIG_AC3_PARSER) += ac3_parser.o ac3tab.o aac_ac3_parser.o
OBJS-$(CONFIG_CAVSVIDEO_PARSER) += cavs_parser.o
@@ -365,6 +374,7 @@ OBJS-$(CONFIG_PNM_PARSER) += pnm_parser.o pnm.o
OBJS-$(CONFIG_VC1_PARSER) += vc1_parser.o
OBJS-$(CONFIG_VP3_PARSER) += vp3_parser.o
+# bitstream filters
OBJS-$(CONFIG_DUMP_EXTRADATA_BSF) += dump_extradata_bsf.o
OBJS-$(CONFIG_H264_MP4TOANNEXB_BSF) += h264_mp4toannexb_bsf.o
OBJS-$(CONFIG_IMX_DUMP_HEADER_BSF) += imx_dump_header_bsf.o
@@ -376,91 +386,97 @@ OBJS-$(CONFIG_NOISE_BSF) += noise_bsf.o
OBJS-$(CONFIG_REMOVE_EXTRADATA_BSF) += remove_extradata_bsf.o
OBJS-$(CONFIG_TEXT2MOVSUB_BSF) += movsub_bsf.o
+# thread libraries
OBJS-$(HAVE_BEOSTHREADS) += beosthread.o
OBJS-$(HAVE_OS2THREADS) += os2thread.o
OBJS-$(HAVE_PTHREADS) += pthread.o
OBJS-$(HAVE_W32THREADS) += w32thread.o
-OBJS-$(HAVE_XVMC) += xvmcvideo.o
-
-ifndef CONFIG_SWSCALE
-OBJS += imgresample.o
-endif
-
# processor-specific code
-ifdef HAVE_MMX
-OBJS += i386/fdct_mmx.o \
- i386/cpuid.o \
- i386/dsputil_mmx.o \
- i386/mpegvideo_mmx.o \
- i386/motion_est_mmx.o \
- i386/simple_idct_mmx.o \
- i386/idct_mmx_xvid.o \
- i386/idct_sse2_xvid.o \
-
-OBJS-$(CONFIG_FFT_MMX) += i386/fft_mmx.o \
- i386/fft_sse.o \
- i386/fft_3dn.o \
- i386/fft_3dn2.o \
-
-OBJS-$(HAVE_YASM) += i386/dsputil_yasm.o
-
-OBJS-$(CONFIG_GPL) += i386/idct_mmx.o
-
-OBJS-$(CONFIG_ENCODERS) += i386/dsputilenc_mmx.o
-
-OBJS-$(CONFIG_CAVS_DECODER) += i386/cavsdsp_mmx.o
-OBJS-$(CONFIG_FLAC_ENCODER) += i386/flacdsp_mmx.o
-OBJS-$(CONFIG_SNOW_DECODER) += i386/snowdsp_mmx.o
-OBJS-$(CONFIG_VC1_DECODER) += i386/vc1dsp_mmx.o
-OBJS-$(CONFIG_VP3_DECODER) += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-OBJS-$(CONFIG_VP5_DECODER) += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-OBJS-$(CONFIG_VP6_DECODER) += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-OBJS-$(CONFIG_VP6A_DECODER) += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-OBJS-$(CONFIG_VP6F_DECODER) += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-OBJS-$(CONFIG_WMV3_DECODER) += i386/vc1dsp_mmx.o
-endif
-
-OBJS-$(ARCH_ARMV4L) += armv4l/jrevdct_arm.o \
- armv4l/simple_idct_arm.o \
- armv4l/dsputil_arm_s.o \
- armv4l/dsputil_arm.o \
- armv4l/mpegvideo_arm.o \
-
-OBJS-$(HAVE_IWMMXT) += armv4l/dsputil_iwmmxt.o \
- armv4l/mpegvideo_iwmmxt.o \
-
-OBJS-$(HAVE_ARMV5TE) += armv4l/mpegvideo_armv5te.o \
- armv4l/simple_idct_armv5te.o \
-
-OBJS-$(HAVE_ARMVFP) += armv4l/float_arm_vfp.o \
- armv4l/dsputil_vfp.o \
-
-OBJS-$(HAVE_ARMV6) += armv4l/simple_idct_armv6.o \
-
-OBJS-$(HAVE_VIS) += sparc/dsputil_vis.o \
- sparc/simple_idct_vis.o \
-
-OBJS-$(CONFIG_MLIB) += mlib/dsputil_mlib.o \
+YASM-OBJS-$(CONFIG_GPL) += x86/h264_deblock_sse2.o \
+ x86/h264_idct_sse2.o \
+
+MMX-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp_mmx.o
+MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o
+MMX-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flacdsp_mmx.o
+MMX-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o
+MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp_mmx.o
+MMX-OBJS-$(CONFIG_THEORA_DECODER) += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o
+MMX-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_VP5_DECODER) += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_VP6_DECODER) += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_VP6A_DECODER) += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_VP6F_DECODER) += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_WMV3_DECODER) += x86/vc1dsp_mmx.o
+MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \
+ $(YASM-OBJS-yes)
+
+OBJS-$(HAVE_MMX) += x86/cpuid.o \
+ x86/dnxhd_mmx.o \
+ x86/dsputil_mmx.o \
+ x86/fdct_mmx.o \
+ x86/idct_mmx_xvid.o \
+ x86/idct_sse2_xvid.o \
+ x86/motion_est_mmx.o \
+ x86/mpegvideo_mmx.o \
+ x86/simple_idct_mmx.o \
+ $(MMX-OBJS-yes)
+
+OBJS-$(CONFIG_FFT_MMX) += x86/fft_3dn.o \
+ x86/fft_3dn2.o \
+ x86/fft_mmx.o \
+ x86/fft_sse.o \
OBJS-$(ARCH_ALPHA) += alpha/dsputil_alpha.o \
+ alpha/dsputil_alpha_asm.o \
alpha/motion_est_alpha.o \
+ alpha/motion_est_mvi_asm.o \
alpha/mpegvideo_alpha.o \
alpha/simple_idct_alpha.o \
- alpha/dsputil_alpha_asm.o \
- alpha/motion_est_mvi_asm.o \
-OBJS-$(ARCH_POWERPC) += ppc/dsputil_ppc.o \
+OBJS-$(ARCH_ARM) += arm/dsputil_arm.o \
+ arm/dsputil_arm_s.o \
+ arm/jrevdct_arm.o \
+ arm/mpegvideo_arm.o \
+ arm/simple_idct_arm.o \
-OBJS-$(HAVE_MMI) += ps2/dsputil_mmi.o \
- ps2/idct_mmi.o \
- ps2/mpegvideo_mmi.o \
+OBJS-$(HAVE_ARMV5TE) += arm/mpegvideo_armv5te.o \
+ arm/mpegvideo_armv5te_s.o \
+ arm/simple_idct_armv5te.o \
-OBJS-$(ARCH_SH4) += sh4/idct_sh4.o \
- sh4/dsputil_align.o \
- sh4/dsputil_sh4.o \
+OBJS-$(HAVE_ARMV6) += arm/simple_idct_armv6.o \
+
+OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \
+ arm/float_arm_vfp.o \
+
+OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \
+ arm/mpegvideo_iwmmxt.o \
-ALTIVEC-OBJS-yes += ppc/dsputil_altivec.o \
+OBJS-$(HAVE_NEON) += arm/dsputil_neon.o \
+ arm/dsputil_neon_s.o \
+ arm/h264dsp_neon.o \
+ arm/h264idct_neon.o \
+ arm/simple_idct_neon.o \
+
+OBJS-$(ARCH_BFIN) += bfin/dsputil_bfin.o \
+ bfin/fdct_bfin.o \
+ bfin/idct_bfin.o \
+ bfin/mpegvideo_bfin.o \
+ bfin/pixels_bfin.o \
+ bfin/vp3_bfin.o \
+ bfin/vp3_idct_bfin.o \
+
+OBJS-$(ARCH_PPC) += ppc/dsputil_ppc.o \
+
+ALTIVEC-OBJS-$(CONFIG_H264_DECODER) += ppc/h264_altivec.o
+ALTIVEC-OBJS-$(CONFIG_OLDSCALER) += ppc/imgresample_altivec.o
+ALTIVEC-OBJS-$(CONFIG_SNOW_DECODER) += ppc/snow_altivec.o
+ALTIVEC-OBJS-$(CONFIG_VC1_DECODER) += ppc/vc1dsp_altivec.o
+ALTIVEC-OBJS-$(CONFIG_WMV3_DECODER) += ppc/vc1dsp_altivec.o
+
+OBJS-$(HAVE_ALTIVEC) += ppc/check_altivec.o \
+ ppc/dsputil_altivec.o \
ppc/fdct_altivec.o \
ppc/fft_altivec.o \
ppc/float_altivec.o \
@@ -468,34 +484,30 @@ ALTIVEC-OBJS-yes += ppc/dsputil_altivec.o \
ppc/idct_altivec.o \
ppc/int_altivec.o \
ppc/mpegvideo_altivec.o \
+ $(ALTIVEC-OBJS-yes)
-ALTIVEC-OBJS-$(CONFIG_H264_DECODER) += ppc/h264_altivec.o
-ALTIVEC-OBJS-$(CONFIG_SNOW_DECODER) += ppc/snow_altivec.o
-ALTIVEC-OBJS-$(CONFIG_VC1_DECODER) += ppc/vc1dsp_altivec.o
-ALTIVEC-OBJS-$(CONFIG_WMV3_DECODER) += ppc/vc1dsp_altivec.o
+OBJS-$(ARCH_SH4) += sh4/dsputil_align.o \
+ sh4/dsputil_sh4.o \
+ sh4/idct_sh4.o \
-# -maltivec is needed in order to build AltiVec code.
-$(addprefix $(SUBDIR),$(ALTIVEC-OBJS-yes)): CFLAGS += -maltivec -mabi=altivec
+OBJS-$(CONFIG_MLIB) += mlib/dsputil_mlib.o \
-# check_altivec must be built without -maltivec
-OBJS-$(HAVE_ALTIVEC) += $(ALTIVEC-OBJS-yes) \
- ppc/check_altivec.o
+OBJS-$(HAVE_MMI) += ps2/dsputil_mmi.o \
+ ps2/idct_mmi.o \
+ ps2/mpegvideo_mmi.o \
+
+OBJS-$(HAVE_VIS) += sparc/dsputil_vis.o \
+ sparc/simple_idct_vis.o \
-OBJS-$(ARCH_BFIN) += bfin/dsputil_bfin.o \
- bfin/mpegvideo_bfin.o \
- bfin/vp3_bfin.o \
- bfin/pixels_bfin.o \
- bfin/fdct_bfin.o \
- bfin/idct_bfin.o \
- bfin/vp3_idct_bfin.o \
-TESTS = $(addsuffix -test$(EXESUF), cabac dct eval fft h264 imgresample rangecoder snow)
-TESTS-$(ARCH_X86) += i386/cpuid-test$(EXESUF) motion-test$(EXESUF)
+TESTS = $(addsuffix -test$(EXESUF), cabac dct eval fft h264 rangecoder snow)
+TESTS-$(CONFIG_OLDSCALER) += imgresample-test$(EXESUF)
+TESTS-$(ARCH_X86) += x86/cpuid-test$(EXESUF) motion-test$(EXESUF)
CLEANFILES = apiexample$(EXESUF)
-DIRS = alpha armv4l bfin i386 mlib ppc ps2 sh4 sparc
+DIRS = alpha arm bfin mlib ppc ps2 sh4 sparc x86
include $(SUBDIR)../subdir.mak
-$(SUBDIR)dct-test$(EXESUF): $(SUBDIR)fdctref.o
+$(SUBDIR)dct-test$(EXESUF): $(SUBDIR)fdctref.o $(SUBDIR)aandcttab.o
$(SUBDIR)fft-test$(EXESUF): $(SUBDIR)fdctref.o
diff --git a/libavcodec/aac.c b/libavcodec/aac.c
index 36742ba..545f125 100644
--- a/libavcodec/aac.c
+++ b/libavcodec/aac.c
@@ -41,7 +41,7 @@
* N (code in SoC repo) Long Term Prediction
* Y intensity stereo
* Y channel coupling
- * N frequency domain prediction
+ * Y frequency domain prediction
* Y Perceptual Noise Substitution
* Y Mid/Side stereo
* N Scalable Inverse AAC Quantization
@@ -77,6 +77,7 @@
#include "avcodec.h"
+#include "internal.h"
#include "bitstream.h"
#include "dsputil.h"
#include "lpc.h"
@@ -85,6 +86,7 @@
#include "aactab.h"
#include "aacdectab.h"
#include "mpeg4audio.h"
+#include "aac_parser.h"
#include <assert.h>
#include <errno.h>
@@ -166,15 +168,16 @@ static void decode_channel_map(enum ChannelPosition *cpe_map,
*/
static int decode_pce(AACContext * ac, enum ChannelPosition new_che_pos[4][MAX_ELEM_ID],
GetBitContext * gb) {
- int num_front, num_side, num_back, num_lfe, num_assoc_data, num_cc;
+ int num_front, num_side, num_back, num_lfe, num_assoc_data, num_cc, sampling_index;
skip_bits(gb, 2); // object_type
- ac->m4ac.sampling_index = get_bits(gb, 4);
- if(ac->m4ac.sampling_index > 11) {
+ sampling_index = get_bits(gb, 4);
+ if(sampling_index > 11) {
av_log(ac->avccontext, AV_LOG_ERROR, "invalid sampling rate index %d\n", ac->m4ac.sampling_index);
return -1;
}
+ ac->m4ac.sampling_index = sampling_index;
ac->m4ac.sample_rate = ff_mpeg4audio_sample_rates[ac->m4ac.sampling_index];
num_front = get_bits(gb, 4);
num_side = get_bits(gb, 4);
@@ -262,7 +265,7 @@ static int decode_ga_specific_config(AACContext * ac, GetBitContext * gb, int ch
int extension_flag, ret;
if(get_bits1(gb)) { // frameLengthFlag
- av_log_missing_feature(ac->avccontext, "960/120 MDCT window is", 1);
+ ff_log_missing_feature(ac->avccontext, "960/120 MDCT window is", 1);
return -1;
}
@@ -331,6 +334,7 @@ static int decode_audio_specific_config(AACContext * ac, void *data, int data_si
skip_bits_long(&gb, i);
switch (ac->m4ac.object_type) {
+ case AOT_AAC_MAIN:
case AOT_AAC_LC:
if (decode_ga_specific_config(ac, &gb, ac->m4ac.chan_config))
return -1;
@@ -354,18 +358,51 @@ static av_always_inline int lcg_random(int previous_val) {
return previous_val * 1664525 + 1013904223;
}
+static void reset_predict_state(PredictorState * ps) {
+ ps->r0 = 0.0f;
+ ps->r1 = 0.0f;
+ ps->cor0 = 0.0f;
+ ps->cor1 = 0.0f;
+ ps->var0 = 1.0f;
+ ps->var1 = 1.0f;
+}
+
+static void reset_all_predictors(PredictorState * ps) {
+ int i;
+ for (i = 0; i < MAX_PREDICTORS; i++)
+ reset_predict_state(&ps[i]);
+}
+
+static void reset_predictor_group(PredictorState * ps, int group_num) {
+ int i;
+ for (i = group_num-1; i < MAX_PREDICTORS; i+=30)
+ reset_predict_state(&ps[i]);
+}
+
static av_cold int aac_decode_init(AVCodecContext * avccontext) {
AACContext * ac = avccontext->priv_data;
int i;
ac->avccontext = avccontext;
- if (avccontext->extradata_size <= 0 ||
- decode_audio_specific_config(ac, avccontext->extradata, avccontext->extradata_size))
+ if (avccontext->extradata_size > 0) {
+ if(decode_audio_specific_config(ac, avccontext->extradata, avccontext->extradata_size))
+ return -1;
+ avccontext->sample_rate = ac->m4ac.sample_rate;
+ } else if (avccontext->channels > 0) {
+ enum ChannelPosition new_che_pos[4][MAX_ELEM_ID];
+ memset(new_che_pos, 0, 4 * MAX_ELEM_ID * sizeof(new_che_pos[0][0]));
+ if(set_default_channel_config(ac, new_che_pos, avccontext->channels - (avccontext->channels == 8)))
+ return -1;
+ if(output_configure(ac, ac->che_pos, new_che_pos))
+ return -1;
+ ac->m4ac.sample_rate = avccontext->sample_rate;
+ } else {
+ ff_log_missing_feature(ac->avccontext, "Implicit channel configuration is", 0);
return -1;
+ }
avccontext->sample_fmt = SAMPLE_FMT_S16;
- avccontext->sample_rate = ac->m4ac.sample_rate;
avccontext->frame_size = 1024;
AAC_INIT_VLC_STATIC( 0, 144);
@@ -432,6 +469,21 @@ static void skip_data_stream_element(GetBitContext * gb) {
skip_bits_long(gb, 8 * count);
}
+static int decode_prediction(AACContext * ac, IndividualChannelStream * ics, GetBitContext * gb) {
+ int sfb;
+ if (get_bits1(gb)) {
+ ics->predictor_reset_group = get_bits(gb, 5);
+ if (ics->predictor_reset_group == 0 || ics->predictor_reset_group > 30) {
+ av_log(ac->avccontext, AV_LOG_ERROR, "Invalid Predictor Reset Group.\n");
+ return -1;
+ }
+ }
+ for (sfb = 0; sfb < FFMIN(ics->max_sfb, ff_aac_pred_sfb_max[ac->m4ac.sampling_index]); sfb++) {
+ ics->prediction_used[sfb] = get_bits1(gb);
+ }
+ return 0;
+}
+
/**
* Decode Individual Channel Stream info; reference: table 4.6.
*
@@ -464,16 +516,30 @@ static int decode_ics_info(AACContext * ac, IndividualChannelStream * ics, GetBi
ics->swb_offset = swb_offset_128[ac->m4ac.sampling_index];
ics->num_swb = ff_aac_num_swb_128[ac->m4ac.sampling_index];
ics->tns_max_bands = tns_max_bands_128[ac->m4ac.sampling_index];
+ ics->predictor_present = 0;
} else {
ics->max_sfb = get_bits(gb, 6);
ics->num_windows = 1;
ics->swb_offset = swb_offset_1024[ac->m4ac.sampling_index];
ics->num_swb = ff_aac_num_swb_1024[ac->m4ac.sampling_index];
ics->tns_max_bands = tns_max_bands_1024[ac->m4ac.sampling_index];
- if (get_bits1(gb)) {
- av_log_missing_feature(ac->avccontext, "Predictor bit set but LTP is", 1);
- memset(ics, 0, sizeof(IndividualChannelStream));
- return -1;
+ ics->predictor_present = get_bits1(gb);
+ ics->predictor_reset_group = 0;
+ if (ics->predictor_present) {
+ if (ac->m4ac.object_type == AOT_AAC_MAIN) {
+ if (decode_prediction(ac, ics, gb)) {
+ memset(ics, 0, sizeof(IndividualChannelStream));
+ return -1;
+ }
+ } else if (ac->m4ac.object_type == AOT_AAC_LC) {
+ av_log(ac->avccontext, AV_LOG_ERROR, "Prediction is not allowed in AAC-LC.\n");
+ memset(ics, 0, sizeof(IndividualChannelStream));
+ return -1;
+ } else {
+ ff_log_missing_feature(ac->avccontext, "Predictor bit set but LTP is", 1);
+ memset(ics, 0, sizeof(IndividualChannelStream));
+ return -1;
+ }
}
}
@@ -689,6 +755,7 @@ static int decode_spectrum_and_dequant(AACContext * ac, float coef[1024], GetBit
const int c = 1024/ics->num_windows;
const uint16_t * offsets = ics->swb_offset;
float *coef_base = coef;
+ static const float sign_lookup[] = { 1.0f, -1.0f };
for (g = 0; g < ics->num_windows; g++)
memset(coef + g * 128 + offsets[ics->max_sfb], 0, sizeof(float)*(c - offsets[ics->max_sfb]));
@@ -699,7 +766,7 @@ static int decode_spectrum_and_dequant(AACContext * ac, float coef[1024], GetBit
const int dim = cur_band_type >= FIRST_PAIR_BT ? 2 : 4;
const int is_cb_unsigned = IS_CODEBOOK_UNSIGNED(cur_band_type);
int group;
- if (cur_band_type == ZERO_BT) {
+ if (cur_band_type == ZERO_BT || cur_band_type == INTENSITY_BT2 || cur_band_type == INTENSITY_BT) {
for (group = 0; group < ics->group_len[g]; group++) {
memset(coef + group * 128 + offsets[i], 0, (offsets[i+1] - offsets[i])*sizeof(float));
}
@@ -717,7 +784,7 @@ static int decode_spectrum_and_dequant(AACContext * ac, float coef[1024], GetBit
coef[group*128+k] *= scale;
}
}
- }else if (cur_band_type != INTENSITY_BT2 && cur_band_type != INTENSITY_BT) {
+ }else {
for (group = 0; group < ics->group_len[g]; group++) {
for (k = offsets[i]; k < offsets[i+1]; k += dim) {
const int index = get_vlc2(gb, vlc_spectral[cur_band_type - 1].table, 6, 3);
@@ -732,12 +799,19 @@ static int decode_spectrum_and_dequant(AACContext * ac, float coef[1024], GetBit
}
vq_ptr = &ff_aac_codebook_vectors[cur_band_type - 1][index * dim];
if (is_cb_unsigned) {
- for (j = 0; j < dim; j++)
- if (vq_ptr[j])
- coef[coef_tmp_idx + j] = 1 - 2*(int)get_bits1(gb);
+ if (vq_ptr[0]) coef[coef_tmp_idx ] = sign_lookup[get_bits1(gb)];
+ if (vq_ptr[1]) coef[coef_tmp_idx + 1] = sign_lookup[get_bits1(gb)];
+ if (dim == 4) {
+ if (vq_ptr[2]) coef[coef_tmp_idx + 2] = sign_lookup[get_bits1(gb)];
+ if (vq_ptr[3]) coef[coef_tmp_idx + 3] = sign_lookup[get_bits1(gb)];
+ }
}else {
- for (j = 0; j < dim; j++)
- coef[coef_tmp_idx + j] = 1.0f;
+ coef[coef_tmp_idx ] = 1.0f;
+ coef[coef_tmp_idx + 1] = 1.0f;
+ if (dim == 4) {
+ coef[coef_tmp_idx + 2] = 1.0f;
+ coef[coef_tmp_idx + 3] = 1.0f;
+ }
}
if (cur_band_type == ESC_BT) {
for (j = 0; j < 2; j++) {
@@ -751,15 +825,25 @@ static int decode_spectrum_and_dequant(AACContext * ac, float coef[1024], GetBit
return -1;
}
n = (1<<n) + get_bits(gb, n);
- coef[coef_tmp_idx + j] *= cbrtf(fabsf(n)) * n;
+ coef[coef_tmp_idx + j] *= cbrtf(n) * n;
}else
coef[coef_tmp_idx + j] *= vq_ptr[j];
}
}else
- for (j = 0; j < dim; j++)
- coef[coef_tmp_idx + j] *= vq_ptr[j];
- for (j = 0; j < dim; j++)
- coef[coef_tmp_idx + j] *= sf[idx];
+ {
+ coef[coef_tmp_idx ] *= vq_ptr[0];
+ coef[coef_tmp_idx + 1] *= vq_ptr[1];
+ if (dim == 4) {
+ coef[coef_tmp_idx + 2] *= vq_ptr[2];
+ coef[coef_tmp_idx + 3] *= vq_ptr[3];
+ }
+ }
+ coef[coef_tmp_idx ] *= sf[idx];
+ coef[coef_tmp_idx + 1] *= sf[idx];
+ if (dim == 4) {
+ coef[coef_tmp_idx + 2] *= sf[idx];
+ coef[coef_tmp_idx + 3] *= sf[idx];
+ }
}
}
}
@@ -786,6 +870,77 @@ static int decode_spectrum_and_dequant(AACContext * ac, float coef[1024], GetBit
return 0;
}
+static av_always_inline float flt16_round(float pf) {
+ int exp;
+ pf = frexpf(pf, &exp);
+ pf = ldexpf(roundf(ldexpf(pf, 8)), exp-8);
+ return pf;
+}
+
+static av_always_inline float flt16_even(float pf) {
+ int exp;
+ pf = frexpf(pf, &exp);
+ pf = ldexpf(rintf(ldexpf(pf, 8)), exp-8);
+ return pf;
+}
+
+static av_always_inline float flt16_trunc(float pf) {
+ int exp;
+ pf = frexpf(pf, &exp);
+ pf = ldexpf(truncf(ldexpf(pf, 8)), exp-8);
+ return pf;
+}
+
+static void predict(AACContext * ac, PredictorState * ps, float* coef, int output_enable) {
+ const float a = 0.953125; // 61.0/64
+ const float alpha = 0.90625; // 29.0/32
+ float e0, e1;
+ float pv;
+ float k1, k2;
+
+ k1 = ps->var0 > 1 ? ps->cor0 * flt16_even(a / ps->var0) : 0;
+ k2 = ps->var1 > 1 ? ps->cor1 * flt16_even(a / ps->var1) : 0;
+
+ pv = flt16_round(k1 * ps->r0 + k2 * ps->r1);
+ if (output_enable)
+ *coef += pv * ac->sf_scale;
+
+ e0 = *coef / ac->sf_scale;
+ e1 = e0 - k1 * ps->r0;
+
+ ps->cor1 = flt16_trunc(alpha * ps->cor1 + ps->r1 * e1);
+ ps->var1 = flt16_trunc(alpha * ps->var1 + 0.5 * (ps->r1 * ps->r1 + e1 * e1));
+ ps->cor0 = flt16_trunc(alpha * ps->cor0 + ps->r0 * e0);
+ ps->var0 = flt16_trunc(alpha * ps->var0 + 0.5 * (ps->r0 * ps->r0 + e0 * e0));
+
+ ps->r1 = flt16_trunc(a * (ps->r0 - k1 * e0));
+ ps->r0 = flt16_trunc(a * e0);
+}
+
+/**
+ * Apply AAC-Main style frequency domain prediction.
+ */
+static void apply_prediction(AACContext * ac, SingleChannelElement * sce) {
+ int sfb, k;
+
+ if (!sce->ics.predictor_initialized) {
+ reset_all_predictors(sce->predictor_state);
+ sce->ics.predictor_initialized = 1;
+ }
+
+ if (sce->ics.window_sequence[0] != EIGHT_SHORT_SEQUENCE) {
+ for (sfb = 0; sfb < ff_aac_pred_sfb_max[ac->m4ac.sampling_index]; sfb++) {
+ for (k = sce->ics.swb_offset[sfb]; k < sce->ics.swb_offset[sfb + 1]; k++) {
+ predict(ac, &sce->predictor_state[k], &sce->coeffs[k],
+ sce->ics.predictor_present && sce->ics.prediction_used[sfb]);
+ }
+ }
+ if (sce->ics.predictor_reset_group)
+ reset_predictor_group(sce->predictor_state, sce->ics.predictor_reset_group);
+ } else
+ reset_all_predictors(sce->predictor_state);
+}
+
/**
* Decode an individual_channel_stream payload; reference: table 4.44.
*
@@ -833,13 +988,17 @@ static int decode_ics(AACContext * ac, SingleChannelElement * sce, GetBitContext
if ((tns->present = get_bits1(gb)) && decode_tns(ac, tns, gb, ics))
return -1;
if (get_bits1(gb)) {
- av_log_missing_feature(ac->avccontext, "SSR", 1);
+ ff_log_missing_feature(ac->avccontext, "SSR", 1);
return -1;
}
}
if (decode_spectrum_and_dequant(ac, out, gb, sce->sf, pulse_present, &pulse, ics, sce->band_type) < 0)
return -1;
+
+ if(ac->m4ac.object_type == AOT_AAC_MAIN && !common_window)
+ apply_prediction(ac, sce);
+
return 0;
}
@@ -940,8 +1099,14 @@ static int decode_cpe(AACContext * ac, GetBitContext * gb, int elem_id) {
if ((ret = decode_ics(ac, &cpe->ch[1], gb, common_window, 0)))
return ret;
- if (common_window && ms_present)
- apply_mid_side_stereo(cpe);
+ if (common_window) {
+ if (ms_present)
+ apply_mid_side_stereo(cpe);
+ if (ac->m4ac.object_type == AOT_AAC_MAIN) {
+ apply_prediction(ac, &cpe->ch[0]);
+ apply_prediction(ac, &cpe->ch[1]);
+ }
+ }
apply_intensity_stereo(cpe, ms_present);
return 0;
@@ -1033,7 +1198,7 @@ static int decode_cce(AACContext * ac, GetBitContext * gb, ChannelElement * che)
*/
static int decode_sbr_extension(AACContext * ac, GetBitContext * gb, int crc, int cnt) {
// TODO : sbr_extension implementation
- av_log_missing_feature(ac->avccontext, "SBR", 0);
+ ff_log_missing_feature(ac->avccontext, "SBR", 0);
skip_bits_long(gb, 8*cnt - 4); // -4 due to reading extension type
return cnt;
}
@@ -1191,7 +1356,7 @@ static void imdct_and_windowing(AACContext * ac, SingleChannelElement * sce) {
const float * lwindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_long_1024 : ff_sine_1024;
const float * swindow_prev = ics->use_kb_window[1] ? ff_aac_kbd_short_128 : ff_sine_128;
float * buf = ac->buf_mdct;
- DECLARE_ALIGNED(16, float, temp[128]);
+ float * temp = ac->temp;
int i;
// imdct
@@ -1354,6 +1519,29 @@ static void spectral_to_sample(AACContext * ac) {
}
}
+static int parse_adts_frame_header(AACContext * ac, GetBitContext * gb) {
+
+ int size;
+ AACADTSHeaderInfo hdr_info;
+
+ size = ff_aac_parse_header(gb, &hdr_info);
+ if (size > 0) {
+ if (hdr_info.chan_config)
+ ac->m4ac.chan_config = hdr_info.chan_config;
+ ac->m4ac.sample_rate = hdr_info.sample_rate;
+ ac->m4ac.sampling_index = hdr_info.sampling_index;
+ ac->m4ac.object_type = hdr_info.object_type;
+ }
+ if (hdr_info.num_aac_frames == 1) {
+ if (!hdr_info.crc_absent)
+ skip_bits(gb, 16);
+ } else {
+ ff_log_missing_feature(ac->avccontext, "More than one AAC RDB per ADTS frame is", 0);
+ return -1;
+ }
+ return size;
+}
+
static int aac_decode_frame(AVCodecContext * avccontext, void * data, int * data_size, const uint8_t * buf, int buf_size) {
AACContext * ac = avccontext->priv_data;
GetBitContext gb;
@@ -1362,6 +1550,13 @@ static int aac_decode_frame(AVCodecContext * avccontext, void * data, int * data
init_get_bits(&gb, buf, buf_size*8);
+ if (show_bits(&gb, 12) == 0xfff) {
+ if ((err = parse_adts_frame_header(ac, &gb)) < 0) {
+ av_log(avccontext, AV_LOG_ERROR, "Error decoding AAC frame header.\n");
+ return -1;
+ }
+ }
+
// parse
while ((elem_type = get_bits(&gb, 3)) != TYPE_END) {
elem_id = get_bits(&gb, 4);
diff --git a/libavcodec/aac.h b/libavcodec/aac.h
index eec0828..d2f81f2 100644
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@@ -133,6 +133,20 @@ enum CouplingPoint {
};
/**
+ * Predictor State
+ */
+typedef struct {
+ float cor0;
+ float cor1;
+ float var0;
+ float var1;
+ float r0;
+ float r1;
+} PredictorState;
+
+#define MAX_PREDICTORS 672
+
+/**
* Individual Channel Stream
*/
typedef struct {
@@ -145,6 +159,10 @@ typedef struct {
int num_swb; ///< number of scalefactor window bands
int num_windows;
int tns_max_bands;
+ int predictor_present;
+ int predictor_initialized;
+ int predictor_reset_group;
+ uint8_t prediction_used[41];
} IndividualChannelStream;
/**
@@ -207,6 +225,7 @@ typedef struct {
DECLARE_ALIGNED_16(float, coeffs[1024]); ///< coefficients for IMDCT
DECLARE_ALIGNED_16(float, saved[512]); ///< overlap
DECLARE_ALIGNED_16(float, ret[1024]); ///< PCM output
+ PredictorState predictor_state[MAX_PREDICTORS];
} SingleChannelElement;
/**
@@ -269,6 +288,7 @@ typedef struct {
int sf_offset; ///< offset into pow2sf_tab as appropriate for dsp.float_to_int16
/** @} */
+ DECLARE_ALIGNED(16, float, temp[128]);
} AACContext;
#endif /* AVCODEC_AAC_H */
diff --git a/libavcodec/aac_ac3_parser.h b/libavcodec/aac_ac3_parser.h
index b2fc0af..b8613d8 100644
--- a/libavcodec/aac_ac3_parser.h
+++ b/libavcodec/aac_ac3_parser.h
@@ -27,6 +27,16 @@
#include "avcodec.h"
#include "parser.h"
+typedef enum {
+ AAC_AC3_PARSE_ERROR_SYNC = -1,
+ AAC_AC3_PARSE_ERROR_BSID = -2,
+ AAC_AC3_PARSE_ERROR_SAMPLE_RATE = -3,
+ AAC_AC3_PARSE_ERROR_FRAME_SIZE = -4,
+ AAC_AC3_PARSE_ERROR_FRAME_TYPE = -5,
+ AAC_AC3_PARSE_ERROR_CRC = -6,
+ AAC_AC3_PARSE_ERROR_CHANNEL_CFG = -7,
+} AACAC3ParseError;
+
typedef struct AACAC3ParseContext {
ParseContext pc;
int frame_size;
diff --git a/libavcodec/aac_parser.c b/libavcodec/aac_parser.c
index 1d75e1e..e38b5ec 100644
--- a/libavcodec/aac_parser.c
+++ b/libavcodec/aac_parser.c
@@ -22,55 +22,80 @@
#include "parser.h"
#include "aac_ac3_parser.h"
+#include "aac_parser.h"
#include "bitstream.h"
#include "mpeg4audio.h"
#define AAC_HEADER_SIZE 7
-static int aac_sync(uint64_t state, AACAC3ParseContext *hdr_info,
- int *need_next_header, int *new_frame_start)
+int ff_aac_parse_header(GetBitContext *gbc, AACADTSHeaderInfo *hdr)
{
- GetBitContext bits;
int size, rdb, ch, sr;
- uint8_t tmp[8];
+ int aot, crc_abs;
- AV_WB64(tmp, state);
- init_get_bits(&bits, tmp+8-AAC_HEADER_SIZE, AAC_HEADER_SIZE * 8);
-
- if(get_bits(&bits, 12) != 0xfff)
- return 0;
+ if(get_bits(gbc, 12) != 0xfff)
+ return AAC_AC3_PARSE_ERROR_SYNC;
- skip_bits1(&bits); /* id */
- skip_bits(&bits, 2); /* layer */
- skip_bits1(&bits); /* protection_absent */
- skip_bits(&bits, 2); /* profile_objecttype */
- sr = get_bits(&bits, 4); /* sample_frequency_index */
+ skip_bits1(gbc); /* id */
+ skip_bits(gbc, 2); /* layer */
+ crc_abs = get_bits1(gbc); /* protection_absent */
+ aot = get_bits(gbc, 2); /* profile_objecttype */
+ sr = get_bits(gbc, 4); /* sample_frequency_index */
if(!ff_mpeg4audio_sample_rates[sr])
- return 0;
- skip_bits1(&bits); /* private_bit */
- ch = get_bits(&bits, 3); /* channel_configuration */
+ return AAC_AC3_PARSE_ERROR_SAMPLE_RATE;
+ skip_bits1(gbc); /* private_bit */
+ ch = get_bits(gbc, 3); /* channel_configuration */
+
if(!ff_mpeg4audio_channels[ch])
- return 0;
- skip_bits1(&bits); /* original/copy */
- skip_bits1(&bits); /* home */
+ return AAC_AC3_PARSE_ERROR_CHANNEL_CFG;
+
+ skip_bits1(gbc); /* original/copy */
+ skip_bits1(gbc); /* home */
/* adts_variable_header */
- skip_bits1(&bits); /* copyright_identification_bit */
- skip_bits1(&bits); /* copyright_identification_start */
- size = get_bits(&bits, 13); /* aac_frame_length */
+ skip_bits1(gbc); /* copyright_identification_bit */
+ skip_bits1(gbc); /* copyright_identification_start */
+ size = get_bits(gbc, 13); /* aac_frame_length */
if(size < AAC_HEADER_SIZE)
- return 0;
+ return AAC_AC3_PARSE_ERROR_FRAME_SIZE;
- skip_bits(&bits, 11); /* adts_buffer_fullness */
- rdb = get_bits(&bits, 2); /* number_of_raw_data_blocks_in_frame */
+ skip_bits(gbc, 11); /* adts_buffer_fullness */
+ rdb = get_bits(gbc, 2); /* number_of_raw_data_blocks_in_frame */
- hdr_info->channels = ff_mpeg4audio_channels[ch];
- hdr_info->sample_rate = ff_mpeg4audio_sample_rates[sr];
- hdr_info->samples = (rdb + 1) * 1024;
- hdr_info->bit_rate = size * 8 * hdr_info->sample_rate / hdr_info->samples;
+ hdr->object_type = aot;
+ hdr->chan_config = ch;
+ hdr->crc_absent = crc_abs;
+ hdr->num_aac_frames = rdb + 1;
+ hdr->sampling_index = sr;
+ hdr->sample_rate = ff_mpeg4audio_sample_rates[sr];
+ hdr->samples = (rdb + 1) * 1024;
+ hdr->bit_rate = size * 8 * hdr->sample_rate / hdr->samples;
+ return size;
+}
+
+static int aac_sync(uint64_t state, AACAC3ParseContext *hdr_info,
+ int *need_next_header, int *new_frame_start)
+{
+ GetBitContext bits;
+ AACADTSHeaderInfo hdr;
+ int size;
+ union {
+ uint64_t u64;
+ uint8_t u8[8];
+ } tmp;
+
+ tmp.u64 = be2me_64(state);
+ init_get_bits(&bits, tmp.u8+8-AAC_HEADER_SIZE, AAC_HEADER_SIZE * 8);
+
+ if ((size = ff_aac_parse_header(&bits, &hdr)) < 0)
+ return 0;
*need_next_header = 0;
*new_frame_start = 1;
+ hdr_info->sample_rate = hdr.sample_rate;
+ hdr_info->channels = ff_mpeg4audio_channels[hdr.chan_config];
+ hdr_info->samples = hdr.samples;
+ hdr_info->bit_rate = hdr.bit_rate;
return size;
}
diff --git a/libavcodec/aac_parser.h b/libavcodec/aac_parser.h
new file mode 100644
index 0000000..efc4678
--- /dev/null
+++ b/libavcodec/aac_parser.h
@@ -0,0 +1,53 @@
+/*
+ * AAC parser prototypes
+ * Copyright (c) 2003 Fabrice Bellard
+ * Copyright (c) 2003 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AAC_PARSER_H
+#define AVCODEC_AAC_PARSER_H
+
+#include <stdint.h>
+#include "aac_ac3_parser.h"
+#include "bitstream.h"
+
+typedef struct {
+ uint32_t sample_rate;
+ uint32_t samples;
+ uint32_t bit_rate;
+ uint8_t crc_absent;
+ uint8_t object_type;
+ uint8_t sampling_index;
+ uint8_t chan_config;
+ uint8_t num_aac_frames;
+} AACADTSHeaderInfo;
+
+/**
+ * Parses AAC frame header.
+ * Parses the ADTS frame header to the end of the variable header, which is
+ * the first 54 bits.
+ * @param gbc[in] BitContext containing the first 54 bits of the frame.
+ * @param hdr[out] Pointer to struct where header info is written.
+ * @return Returns the frame size in bytes on success, or a negative
+ *         AAC_AC3_PARSE_ERROR_* value: -1 on sync word mismatch, -3 on an
+ *         invalid sample rate, -4 on an invalid frame size, -7 on an invalid channel configuration.
+ */
+int ff_aac_parse_header(GetBitContext *gbc, AACADTSHeaderInfo *hdr);
+
+#endif /* AVCODEC_AAC_PARSER_H */
diff --git a/libavcodec/aactab.c b/libavcodec/aactab.c
index 0a8b032..3eab636 100644
--- a/libavcodec/aactab.c
+++ b/libavcodec/aactab.c
@@ -43,6 +43,10 @@ const uint8_t ff_aac_num_swb_128[] = {
12, 12, 12, 14, 14, 14, 15, 15, 15, 15, 15, 15
};
+const uint8_t ff_aac_pred_sfb_max[] = {
+ 33, 33, 38, 40, 40, 40, 41, 41, 37, 37, 37, 34
+};
+
const uint32_t ff_aac_scalefactor_code[121] = {
0x3ffe8, 0x3ffe6, 0x3ffe7, 0x3ffe5, 0x7fff5, 0x7fff1, 0x7ffed, 0x7fff6,
0x7ffee, 0x7ffef, 0x7fff0, 0x7fffc, 0x7fffd, 0x7ffff, 0x7fffe, 0x7fff7,
diff --git a/libavcodec/aactab.h b/libavcodec/aactab.h
index f019f7e..07574d8 100644
--- a/libavcodec/aactab.h
+++ b/libavcodec/aactab.h
@@ -54,6 +54,8 @@ extern const uint8_t ff_aac_num_swb_1024[];
extern const uint8_t ff_aac_num_swb_128 [];
// @}
+extern const uint8_t ff_aac_pred_sfb_max [];
+
extern const uint32_t ff_aac_scalefactor_code[121];
extern const uint8_t ff_aac_scalefactor_bits[121];
diff --git a/libavcodec/aandcttab.c b/libavcodec/aandcttab.c
new file mode 100644
index 0000000..4097067
--- /dev/null
+++ b/libavcodec/aandcttab.c
@@ -0,0 +1,47 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file aandcttab.c
+ * AAN (Arai Agui Nakajima) (I)DCT tables
+ */
+
+#include <stdint.h>
+
+const uint16_t ff_aanscales[64] = {
+ /* precomputed values scaled up by 14 bits */
+ 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
+ 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270,
+ 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906,
+ 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
+ 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
+ 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552,
+ 8867 , 12299, 11585, 10426, 8867, 6967, 4799, 2446,
+ 4520 , 6270, 5906, 5315, 4520, 3552, 2446, 1247
+};
+
+const uint16_t ff_inv_aanscales[64] = {
+ 4096, 2953, 3135, 3483, 4096, 5213, 7568, 14846,
+ 2953, 2129, 2260, 2511, 2953, 3759, 5457, 10703,
+ 3135, 2260, 2399, 2666, 3135, 3990, 5793, 11363,
+ 3483, 2511, 2666, 2962, 3483, 4433, 6436, 12625,
+ 4096, 2953, 3135, 3483, 4096, 5213, 7568, 14846,
+ 5213, 3759, 3990, 4433, 5213, 6635, 9633, 18895,
+ 7568, 5457, 5793, 6436, 7568, 9633, 13985, 27432,
+ 14846, 10703, 11363, 12625, 14846, 18895, 27432, 53809,
+};
diff --git a/libavcodec/aandcttab.h b/libavcodec/aandcttab.h
new file mode 100644
index 0000000..03bb8b5
--- /dev/null
+++ b/libavcodec/aandcttab.h
@@ -0,0 +1,32 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file aandcttab.h
+ * AAN (Arai Agui Nakajima) (I)DCT tables
+ */
+
+#ifndef AVCODEC_AANDCTTAB_H
+#define AVCODEC_AANDCTTAB_H
+
+#include <stdint.h>
+
+extern const uint16_t ff_aanscales[64];
+extern const uint16_t ff_inv_aanscales[64];
+
+#endif /* AVCODEC_AANDCTTAB_H */
diff --git a/libavcodec/aasc.c b/libavcodec/aasc.c
index 77162f9..fa32231 100644
--- a/libavcodec/aasc.c
+++ b/libavcodec/aasc.c
@@ -62,6 +62,7 @@ static int aasc_decode_frame(AVCodecContext *avctx,
const uint8_t *buf, int buf_size)
{
AascContext *s = avctx->priv_data;
+ int compr, i, stride;
s->frame.reference = 1;
s->frame.buffer_hints = FF_BUFFER_HINTS_VALID | FF_BUFFER_HINTS_PRESERVE | FF_BUFFER_HINTS_REUSABLE;
@@ -70,7 +71,24 @@ static int aasc_decode_frame(AVCodecContext *avctx,
return -1;
}
- ff_msrle_decode(avctx, &s->frame, 8, buf, buf_size);
+ compr = AV_RL32(buf);
+ buf += 4;
+ buf_size -= 4;
+ switch(compr){
+ case 0:
+ stride = (avctx->width * 3 + 3) & ~3;
+ for(i = avctx->height - 1; i >= 0; i--){
+ memcpy(s->frame.data[0] + i*s->frame.linesize[0], buf, avctx->width*3);
+ buf += stride;
+ }
+ break;
+ case 1:
+ ff_msrle_decode(avctx, &s->frame, 8, buf - 4, buf_size + 4);
+ break;
+ default:
+ av_log(avctx, AV_LOG_ERROR, "Unknown compression type %d\n", compr);
+ return -1;
+ }
*data_size = sizeof(AVFrame);
*(AVFrame*)data = s->frame;
diff --git a/libavcodec/ac3.c b/libavcodec/ac3.c
index e4117f1..cb1a147 100644
--- a/libavcodec/ac3.c
+++ b/libavcodec/ac3.c
@@ -80,11 +80,11 @@ void ff_ac3_bit_alloc_calc_psd(int8_t *exp, int start, int end, int16_t *psd,
} while (end > band_start_tab[k]);
}
-void ff_ac3_bit_alloc_calc_mask(AC3BitAllocParameters *s, int16_t *band_psd,
- int start, int end, int fast_gain, int is_lfe,
- int dba_mode, int dba_nsegs, uint8_t *dba_offsets,
- uint8_t *dba_lengths, uint8_t *dba_values,
- int16_t *mask)
+int ff_ac3_bit_alloc_calc_mask(AC3BitAllocParameters *s, int16_t *band_psd,
+ int start, int end, int fast_gain, int is_lfe,
+ int dba_mode, int dba_nsegs, uint8_t *dba_offsets,
+ uint8_t *dba_lengths, uint8_t *dba_values,
+ int16_t *mask)
{
int16_t excite[50]; /* excitation */
int bin, k;
@@ -156,9 +156,13 @@ void ff_ac3_bit_alloc_calc_mask(AC3BitAllocParameters *s, int16_t *band_psd,
if (dba_mode == DBA_REUSE || dba_mode == DBA_NEW) {
int band, seg, delta;
+ if (dba_nsegs >= 8)
+ return -1;
band = 0;
- for (seg = 0; seg < FFMIN(8, dba_nsegs); seg++) {
- band = FFMIN(49, band + dba_offsets[seg]);
+ for (seg = 0; seg < dba_nsegs; seg++) {
+ band += dba_offsets[seg];
+ if (band >= 50 || dba_lengths[seg] > 50-band)
+ return -1;
if (dba_values[seg] >= 4) {
delta = (dba_values[seg] - 3) << 7;
} else {
@@ -170,6 +174,7 @@ void ff_ac3_bit_alloc_calc_mask(AC3BitAllocParameters *s, int16_t *band_psd,
}
}
}
+ return 0;
}
void ff_ac3_bit_alloc_calc_bap(int16_t *mask, int16_t *psd, int start, int end,
diff --git a/libavcodec/ac3.h b/libavcodec/ac3.h
index 1f5a711..cbbb718 100644
--- a/libavcodec/ac3.h
+++ b/libavcodec/ac3.h
@@ -149,12 +149,13 @@ void ff_ac3_bit_alloc_calc_psd(int8_t *exp, int start, int end, int16_t *psd,
* @param[in] dba_lengths length of each segment
* @param[in] dba_values delta bit allocation for each segment
* @param[out] mask calculated masking curve
+ * @return returns 0 for success, non-zero for error
*/
-void ff_ac3_bit_alloc_calc_mask(AC3BitAllocParameters *s, int16_t *band_psd,
- int start, int end, int fast_gain, int is_lfe,
- int dba_mode, int dba_nsegs, uint8_t *dba_offsets,
- uint8_t *dba_lengths, uint8_t *dba_values,
- int16_t *mask);
+int ff_ac3_bit_alloc_calc_mask(AC3BitAllocParameters *s, int16_t *band_psd,
+ int start, int end, int fast_gain, int is_lfe,
+ int dba_mode, int dba_nsegs, uint8_t *dba_offsets,
+ uint8_t *dba_lengths, uint8_t *dba_values,
+ int16_t *mask);
/**
* Calculates bit allocation pointers.
diff --git a/libavcodec/ac3_parser.c b/libavcodec/ac3_parser.c
index aedcbcd..f47f97d 100644
--- a/libavcodec/ac3_parser.c
+++ b/libavcodec/ac3_parser.c
@@ -42,12 +42,12 @@ int ff_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo *hdr)
hdr->sync_word = get_bits(gbc, 16);
if(hdr->sync_word != 0x0B77)
- return AC3_PARSE_ERROR_SYNC;
+ return AAC_AC3_PARSE_ERROR_SYNC;
/* read ahead to bsid to distinguish between AC-3 and E-AC-3 */
hdr->bitstream_id = show_bits_long(gbc, 29) & 0x1F;
if(hdr->bitstream_id > 16)
- return AC3_PARSE_ERROR_BSID;
+ return AAC_AC3_PARSE_ERROR_BSID;
hdr->num_blocks = 6;
@@ -60,11 +60,11 @@ int ff_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo *hdr)
hdr->crc1 = get_bits(gbc, 16);
hdr->sr_code = get_bits(gbc, 2);
if(hdr->sr_code == 3)
- return AC3_PARSE_ERROR_SAMPLE_RATE;
+ return AAC_AC3_PARSE_ERROR_SAMPLE_RATE;
frame_size_code = get_bits(gbc, 6);
if(frame_size_code > 37)
- return AC3_PARSE_ERROR_FRAME_SIZE;
+ return AAC_AC3_PARSE_ERROR_FRAME_SIZE;
skip_bits(gbc, 5); // skip bsid, already got it
@@ -93,19 +93,19 @@ int ff_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo *hdr)
hdr->crc1 = 0;
hdr->frame_type = get_bits(gbc, 2);
if(hdr->frame_type == EAC3_FRAME_TYPE_RESERVED)
- return AC3_PARSE_ERROR_FRAME_TYPE;
+ return AAC_AC3_PARSE_ERROR_FRAME_TYPE;
hdr->substreamid = get_bits(gbc, 3);
hdr->frame_size = (get_bits(gbc, 11) + 1) << 1;
if(hdr->frame_size < AC3_HEADER_SIZE)
- return AC3_PARSE_ERROR_FRAME_SIZE;
+ return AAC_AC3_PARSE_ERROR_FRAME_SIZE;
hdr->sr_code = get_bits(gbc, 2);
if (hdr->sr_code == 3) {
int sr_code2 = get_bits(gbc, 2);
if(sr_code2 == 3)
- return AC3_PARSE_ERROR_SAMPLE_RATE;
+ return AAC_AC3_PARSE_ERROR_SAMPLE_RATE;
hdr->sample_rate = ff_ac3_sample_rate_tab[sr_code2] / 2;
hdr->sr_shift = 1;
} else {
@@ -158,11 +158,14 @@ static int ac3_sync(uint64_t state, AACAC3ParseContext *hdr_info,
int *need_next_header, int *new_frame_start)
{
int err;
- uint64_t tmp = be2me_64(state);
+ union {
+ uint64_t u64;
+ uint8_t u8[8];
+ } tmp = { be2me_64(state) };
AC3HeaderInfo hdr;
GetBitContext gbc;
- init_get_bits(&gbc, ((uint8_t *)&tmp)+8-AC3_HEADER_SIZE, 54);
+ init_get_bits(&gbc, tmp.u8+8-AC3_HEADER_SIZE, 54);
err = ff_ac3_parse_header(&gbc, &hdr);
if(err < 0)
diff --git a/libavcodec/ac3_parser.h b/libavcodec/ac3_parser.h
index fc17e87..0f8fc6d 100644
--- a/libavcodec/ac3_parser.h
+++ b/libavcodec/ac3_parser.h
@@ -26,15 +26,6 @@
#include "ac3.h"
#include "bitstream.h"
-typedef enum {
- AC3_PARSE_ERROR_SYNC = -1,
- AC3_PARSE_ERROR_BSID = -2,
- AC3_PARSE_ERROR_SAMPLE_RATE = -3,
- AC3_PARSE_ERROR_FRAME_SIZE = -4,
- AC3_PARSE_ERROR_FRAME_TYPE = -5,
- AC3_PARSE_ERROR_CRC = -6,
-} AC3ParseError;
-
/**
* Parses AC-3 frame header.
* Parses the header up to the lfeon element, which is the first 52 or 54 bits
diff --git a/libavcodec/ac3dec.c b/libavcodec/ac3dec.c
index 3e8b0b5..74c8748 100644
--- a/libavcodec/ac3dec.c
+++ b/libavcodec/ac3dec.c
@@ -35,6 +35,8 @@
#include <string.h>
#include "libavutil/crc.h"
+#include "internal.h"
+#include "aac_ac3_parser.h"
#include "ac3_parser.h"
#include "ac3dec.h"
#include "ac3dec_data.h"
@@ -372,8 +374,8 @@ static void set_downmix_coeffs(AC3DecodeContext *s)
* Decode the grouped exponents according to exponent strategy.
* reference: Section 7.1.3 Exponent Decoding
*/
-static void decode_exponents(GetBitContext *gbc, int exp_strategy, int ngrps,
- uint8_t absexp, int8_t *dexps)
+static int decode_exponents(GetBitContext *gbc, int exp_strategy, int ngrps,
+ uint8_t absexp, int8_t *dexps)
{
int i, j, grp, group_size;
int dexp[256];
@@ -390,12 +392,18 @@ static void decode_exponents(GetBitContext *gbc, int exp_strategy, int ngrps,
/* convert to absolute exps and expand groups */
prevexp = absexp;
- for(i=0; i<ngrps*3; i++) {
- prevexp = av_clip(prevexp + dexp[i]-2, 0, 24);
- for(j=0; j<group_size; j++) {
- dexps[(i*group_size)+j] = prevexp;
+ for(i=0,j=0; i<ngrps*3; i++) {
+ prevexp += dexp[i] - 2;
+ if (prevexp > 24U)
+ return -1;
+ switch (group_size) {
+ case 4: dexps[j++] = prevexp;
+ dexps[j++] = prevexp;
+ case 2: dexps[j++] = prevexp;
+ case 1: dexps[j++] = prevexp;
}
}
+ return 0;
}
/**
@@ -728,9 +736,10 @@ static void decode_band_structure(GetBitContext *gbc, int blk, int eac3,
int ecpl, int start_subband, int end_subband,
const uint8_t *default_band_struct,
uint8_t *band_struct, int *num_subbands,
- int *num_bands, int *band_sizes)
+ int *num_bands, uint8_t *band_sizes)
{
- int subbnd, bnd, n_subbands, n_bands, bnd_sz[22];
+ int subbnd, bnd, n_subbands, n_bands=0;
+ uint8_t bnd_sz[22];
n_subbands = end_subband - start_subband;
@@ -769,7 +778,7 @@ static void decode_band_structure(GetBitContext *gbc, int blk, int eac3,
if (num_bands)
*num_bands = n_bands;
if (band_sizes)
- memcpy(band_sizes, bnd_sz, sizeof(int)*n_bands);
+ memcpy(band_sizes, bnd_sz, n_bands);
}
/**
@@ -819,7 +828,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
/* spectral extension strategy */
if (s->eac3 && (!blk || get_bits1(gbc))) {
if (get_bits1(gbc)) {
- av_log_missing_feature(s->avctx, "Spectral extension", 1);
+ ff_log_missing_feature(s->avctx, "Spectral extension", 1);
return -1;
}
/* TODO: parse spectral extension strategy info */
@@ -844,7 +853,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
/* check for enhanced coupling */
if (s->eac3 && get_bits1(gbc)) {
/* TODO: parse enhanced coupling strategy info */
- av_log_missing_feature(s->avctx, "Enhanced coupling", 1);
+ ff_log_missing_feature(s->avctx, "Enhanced coupling", 1);
return -1;
}
@@ -988,9 +997,12 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
for (ch = !cpl_in_use; ch <= s->channels; ch++) {
if (s->exp_strategy[blk][ch] != EXP_REUSE) {
s->dexps[ch][0] = get_bits(gbc, 4) << !ch;
- decode_exponents(gbc, s->exp_strategy[blk][ch],
- s->num_exp_groups[ch], s->dexps[ch][0],
- &s->dexps[ch][s->start_freq[ch]+!!ch]);
+ if (decode_exponents(gbc, s->exp_strategy[blk][ch],
+ s->num_exp_groups[ch], s->dexps[ch][0],
+ &s->dexps[ch][s->start_freq[ch]+!!ch])) {
+ av_log(s->avctx, AV_LOG_ERROR, "exponent out-of-range\n");
+ return -1;
+ }
if(ch != CPL_CH && ch != s->lfe_ch)
skip_bits(gbc, 2); /* skip gainrng */
}
@@ -1123,12 +1135,15 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
if(bit_alloc_stages[ch] > 1) {
/* Compute excitation function, Compute masking curve, and
Apply delta bit allocation */
- ff_ac3_bit_alloc_calc_mask(&s->bit_alloc_params, s->band_psd[ch],
- s->start_freq[ch], s->end_freq[ch],
- s->fast_gain[ch], (ch == s->lfe_ch),
- s->dba_mode[ch], s->dba_nsegs[ch],
- s->dba_offsets[ch], s->dba_lengths[ch],
- s->dba_values[ch], s->mask[ch]);
+ if (ff_ac3_bit_alloc_calc_mask(&s->bit_alloc_params, s->band_psd[ch],
+ s->start_freq[ch], s->end_freq[ch],
+ s->fast_gain[ch], (ch == s->lfe_ch),
+ s->dba_mode[ch], s->dba_nsegs[ch],
+ s->dba_offsets[ch], s->dba_lengths[ch],
+ s->dba_values[ch], s->mask[ch])) {
+ av_log(s->avctx, AV_LOG_ERROR, "error in bit allocation\n");
+ return -1;
+ }
}
if(bit_alloc_stages[ch] > 0) {
/* Compute bit allocation */
@@ -1234,32 +1249,32 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
/* check that reported frame size fits in input buffer */
if(s->frame_size > buf_size) {
av_log(avctx, AV_LOG_ERROR, "incomplete frame\n");
- err = AC3_PARSE_ERROR_FRAME_SIZE;
+ err = AAC_AC3_PARSE_ERROR_FRAME_SIZE;
}
/* check for crc mismatch */
- if(err != AC3_PARSE_ERROR_FRAME_SIZE && avctx->error_recognition >= FF_ER_CAREFUL) {
+ if(err != AAC_AC3_PARSE_ERROR_FRAME_SIZE && avctx->error_recognition >= FF_ER_CAREFUL) {
if(av_crc(av_crc_get_table(AV_CRC_16_ANSI), 0, &buf[2], s->frame_size-2)) {
av_log(avctx, AV_LOG_ERROR, "frame CRC mismatch\n");
- err = AC3_PARSE_ERROR_CRC;
+ err = AAC_AC3_PARSE_ERROR_CRC;
}
}
- if(err && err != AC3_PARSE_ERROR_CRC) {
+ if(err && err != AAC_AC3_PARSE_ERROR_CRC) {
switch(err) {
- case AC3_PARSE_ERROR_SYNC:
+ case AAC_AC3_PARSE_ERROR_SYNC:
av_log(avctx, AV_LOG_ERROR, "frame sync error\n");
return -1;
- case AC3_PARSE_ERROR_BSID:
+ case AAC_AC3_PARSE_ERROR_BSID:
av_log(avctx, AV_LOG_ERROR, "invalid bitstream id\n");
break;
- case AC3_PARSE_ERROR_SAMPLE_RATE:
+ case AAC_AC3_PARSE_ERROR_SAMPLE_RATE:
av_log(avctx, AV_LOG_ERROR, "invalid sample rate\n");
break;
- case AC3_PARSE_ERROR_FRAME_SIZE:
+ case AAC_AC3_PARSE_ERROR_FRAME_SIZE:
av_log(avctx, AV_LOG_ERROR, "invalid frame size\n");
break;
- case AC3_PARSE_ERROR_FRAME_TYPE:
+ case AAC_AC3_PARSE_ERROR_FRAME_TYPE:
/* skip frame if CRC is ok. otherwise use error concealment. */
/* TODO: add support for substreams and dependent frames */
if(s->frame_type == EAC3_FRAME_TYPE_DEPENDENT || s->substreamid) {
@@ -1308,6 +1323,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
const float *output[s->out_channels];
if (!err && decode_audio_block(s, blk)) {
av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n");
+ err = 1;
}
for (ch = 0; ch < s->out_channels; ch++)
output[ch] = s->output[ch];
diff --git a/libavcodec/acelp_pitch_delay.c b/libavcodec/acelp_pitch_delay.c
index 3db5ea3..ac929c4 100644
--- a/libavcodec/acelp_pitch_delay.c
+++ b/libavcodec/acelp_pitch_delay.c
@@ -21,6 +21,7 @@
*/
#include "avcodec.h"
+#include "dsputil.h"
#include "acelp_pitch_delay.h"
#include "celp_math.h"
@@ -87,6 +88,7 @@ void ff_acelp_update_past_gain(
}
int16_t ff_acelp_decode_gain_code(
+ DSPContext *dsp,
int gain_corr_factor,
const int16_t* fc_v,
int mr_energy,
@@ -103,7 +105,7 @@ int16_t ff_acelp_decode_gain_code(
mr_energy += quant_energy[i] * ma_prediction_coeff[i];
#ifdef G729_BITEXACT
- mr_energy += (((-6165LL * ff_log2(dot_product(fc_v, fc_v, subframe_size, 0))) >> 3) & ~0x3ff);
+ mr_energy += (((-6165LL * ff_log2(dsp->scalarproduct_int16(fc_v, fc_v, subframe_size, 0))) >> 3) & ~0x3ff);
mr_energy = (5439 * (mr_energy >> 15)) >> 8; // (0.15) = (0.15) * (7.23)
@@ -113,7 +115,7 @@ int16_t ff_acelp_decode_gain_code(
);
#else
mr_energy = gain_corr_factor * exp(M_LN10 / (20 << 23) * mr_energy) /
- sqrt(dot_product(fc_v, fc_v, subframe_size, 0));
+ sqrt(dsp->scalarproduct_int16(fc_v, fc_v, subframe_size, 0));
return mr_energy >> 12;
#endif
}
diff --git a/libavcodec/acelp_pitch_delay.h b/libavcodec/acelp_pitch_delay.h
index be5ac09..2504a9e 100644
--- a/libavcodec/acelp_pitch_delay.h
+++ b/libavcodec/acelp_pitch_delay.h
@@ -24,6 +24,7 @@
#define AVCODEC_ACELP_PITCH_DELAY_H
#include <stdint.h>
+#include "dsputil.h"
#define PITCH_DELAY_MIN 20
#define PITCH_DELAY_MAX 143
@@ -140,6 +141,7 @@ void ff_acelp_update_past_gain(
/**
* \brief Decode the adaptive codebook gain and add
* correction (4.1.5 and 3.9.1 of G.729).
+ * \param dsp initialized dsputil context
* \param gain_corr_factor gain correction factor (2.13)
* \param fc_v fixed-codebook vector (2.13)
* \param mr_energy mean innovation energy and fixed-point correction (7.13)
@@ -209,6 +211,7 @@ void ff_acelp_update_past_gain(
* \remark The routine is used in G.729 and AMR (all modes).
*/
int16_t ff_acelp_decode_gain_code(
+ DSPContext *dsp,
int gain_corr_factor,
const int16_t* fc_v,
int mr_energy,
diff --git a/libavcodec/alacenc.c b/libavcodec/alacenc.c
index df4136f..9fd5064 100644
--- a/libavcodec/alacenc.c
+++ b/libavcodec/alacenc.c
@@ -253,7 +253,8 @@ static void alac_linear_predictor(AlacEncodeContext *s, int ch)
sum >>= lpc.lpc_quant;
sum += samples[0];
- residual[i] = samples[lpc.lpc_order+1] - sum;
+ residual[i] = (samples[lpc.lpc_order+1] - sum) << (32 - s->write_sample_size) >>
+ (32 - s->write_sample_size);
res_val = residual[i];
if(res_val) {
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 7d958ee..2d6ce40 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -41,16 +41,6 @@
extern AVBitStreamFilter x##_bsf; \
if(ENABLE_##X##_BSF) av_register_bitstream_filter(&x##_bsf); }
-/**
- * Register all the codecs, parsers and bitstream filters which were enabled at
- * configuration time. If you do not call this function you can select exactly
- * which formats you want to support, by using the individual registration
- * functions.
- *
- * @see register_avcodec
- * @see av_register_codec_parser
- * @see av_register_bitstream_filter
- */
void avcodec_register_all(void)
{
static int initialized;
@@ -97,6 +87,7 @@ void avcodec_register_all(void)
REGISTER_DECODER (H263, h263);
REGISTER_DECODER (H263I, h263i);
REGISTER_DECODER (H264, h264);
+ REGISTER_DECODER (H264_VDPAU, h264_vdpau);
REGISTER_ENCDEC (HUFFYUV, huffyuv);
REGISTER_DECODER (IDCIN, idcin);
REGISTER_DECODER (INDEO2, indeo2);
@@ -141,6 +132,8 @@ void avcodec_register_all(void)
REGISTER_DECODER (RPZA, rpza);
REGISTER_ENCDEC (RV10, rv10);
REGISTER_ENCDEC (RV20, rv20);
+ REGISTER_DECODER (RV30, rv30);
+ REGISTER_DECODER (RV40, rv40);
REGISTER_ENCDEC (SGI, sgi);
REGISTER_DECODER (SMACKER, smacker);
REGISTER_DECODER (SMC, smc);
@@ -202,6 +195,7 @@ void avcodec_register_all(void)
REGISTER_DECODER (MPC7, mpc7);
REGISTER_DECODER (MPC8, mpc8);
REGISTER_ENCDEC (NELLYMOSER, nellymoser);
+ REGISTER_DECODER (QCELP, qcelp);
REGISTER_DECODER (QDM2, qdm2);
REGISTER_DECODER (RA_144, ra_144);
REGISTER_DECODER (RA_288, ra_288);
diff --git a/libavcodec/alpha/dsputil_alpha.c b/libavcodec/alpha/dsputil_alpha.c
index 6b80731..c1500b1 100644
--- a/libavcodec/alpha/dsputil_alpha.c
+++ b/libavcodec/alpha/dsputil_alpha.c
@@ -22,9 +22,9 @@
#include "libavcodec/dsputil.h"
#include "asm.h"
-extern void simple_idct_axp(DCTELEM *block);
-extern void simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block);
-extern void simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block);
+void simple_idct_axp(DCTELEM *block);
+void simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block);
+void simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block);
void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
int line_size, int h);
diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index 82d567f..8746724 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -199,6 +199,7 @@ static av_cold int ape_decode_init(AVCodecContext * avctx)
dsputil_init(&s->dsp, avctx);
avctx->sample_fmt = SAMPLE_FMT_S16;
+ avctx->channel_layout = (avctx->channels==2) ? CH_LAYOUT_STEREO : CH_LAYOUT_MONO;
return 0;
}
diff --git a/libavcodec/arm/asm.S b/libavcodec/arm/asm.S
new file mode 100644
index 0000000..0aa1639
--- /dev/null
+++ b/libavcodec/arm/asm.S
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+ .macro require8, val=1
+ .eabi_attribute 24, \val
+ .endm
+
+ .macro preserve8, val=1
+ .eabi_attribute 25, \val
+ .endm
+
+ .macro function name, export=0
+.if \export
+ .global \name
+.endif
+ .type \name, %function
+ .func \name
+\name:
+ .endm
+
+ .macro movrel rd, val
+#if defined(HAVE_ARMV6T2) && !defined(CONFIG_SHARED)
+ movw \rd, #:lower16:\val
+ movt \rd, #:upper16:\val
+#else
+ ldr \rd, =\val
+#endif
+ .endm
diff --git a/libavcodec/arm/dsputil_arm.c b/libavcodec/arm/dsputil_arm.c
new file mode 100644
index 0000000..eaa6b9e
--- /dev/null
+++ b/libavcodec/arm/dsputil_arm.c
@@ -0,0 +1,217 @@
+/*
+ * ARM optimized DSP utils
+ * Copyright (c) 2001 Lionel Ulmer.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+#ifdef HAVE_IPP
+#include <ipp.h>
+#endif
+
+void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
+void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx);
+void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);
+
+void j_rev_dct_ARM(DCTELEM *data);
+void simple_idct_ARM(DCTELEM *data);
+
+void simple_idct_armv5te(DCTELEM *data);
+void simple_idct_put_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
+void simple_idct_add_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
+
+void ff_simple_idct_armv6(DCTELEM *data);
+void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);
+
+void ff_simple_idct_neon(DCTELEM *data);
+void ff_simple_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
+
+/* XXX: local hack */
+static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
+static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
+
+void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+
+void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+
+void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+
+void ff_prefetch_arm(void *mem, int stride, int h);
+
+CALL_2X_PIXELS(put_pixels16_x2_arm , put_pixels8_x2_arm , 8)
+CALL_2X_PIXELS(put_pixels16_y2_arm , put_pixels8_y2_arm , 8)
+CALL_2X_PIXELS(put_pixels16_xy2_arm, put_pixels8_xy2_arm, 8)
+CALL_2X_PIXELS(put_no_rnd_pixels16_x2_arm , put_no_rnd_pixels8_x2_arm , 8)
+CALL_2X_PIXELS(put_no_rnd_pixels16_y2_arm , put_no_rnd_pixels8_y2_arm , 8)
+CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_arm, put_no_rnd_pixels8_xy2_arm, 8)
+
+void ff_add_pixels_clamped_ARM(short *block, unsigned char *dest,
+ int line_size);
+
+/* XXX: those functions should be suppressed ASAP when all IDCTs are
+ converted */
+static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ j_rev_dct_ARM (block);
+ ff_put_pixels_clamped(block, dest, line_size);
+}
+static void j_rev_dct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ j_rev_dct_ARM (block);
+ ff_add_pixels_clamped(block, dest, line_size);
+}
+static void simple_idct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ simple_idct_ARM (block);
+ ff_put_pixels_clamped(block, dest, line_size);
+}
+static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ simple_idct_ARM (block);
+ ff_add_pixels_clamped(block, dest, line_size);
+}
+
+#ifdef HAVE_IPP
+static void simple_idct_ipp(DCTELEM *block)
+{
+ ippiDCT8x8Inv_Video_16s_C1I(block);
+}
+static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size);
+}
+
+void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size);
+
+static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ippiDCT8x8Inv_Video_16s_C1I(block);
+#ifdef HAVE_IWMMXT
+ add_pixels_clamped_iwmmxt(block, dest, line_size);
+#else
+ ff_add_pixels_clamped_ARM(block, dest, line_size);
+#endif
+}
+#endif
+
+int mm_support(void)
+{
+ return ENABLE_IWMMXT * FF_MM_IWMMXT;
+}
+
+void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx)
+{
+ int idct_algo= avctx->idct_algo;
+
+ ff_put_pixels_clamped = c->put_pixels_clamped;
+ ff_add_pixels_clamped = c->add_pixels_clamped;
+
+ if (avctx->lowres == 0) {
+ if(idct_algo == FF_IDCT_AUTO){
+#if defined(HAVE_IPP)
+ idct_algo = FF_IDCT_IPP;
+#elif defined(HAVE_NEON)
+ idct_algo = FF_IDCT_SIMPLENEON;
+#elif defined(HAVE_ARMV6)
+ idct_algo = FF_IDCT_SIMPLEARMV6;
+#elif defined(HAVE_ARMV5TE)
+ idct_algo = FF_IDCT_SIMPLEARMV5TE;
+#else
+ idct_algo = FF_IDCT_ARM;
+#endif
+ }
+
+ if(idct_algo==FF_IDCT_ARM){
+ c->idct_put= j_rev_dct_ARM_put;
+ c->idct_add= j_rev_dct_ARM_add;
+ c->idct = j_rev_dct_ARM;
+ c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
+ } else if (idct_algo==FF_IDCT_SIMPLEARM){
+ c->idct_put= simple_idct_ARM_put;
+ c->idct_add= simple_idct_ARM_add;
+ c->idct = simple_idct_ARM;
+ c->idct_permutation_type= FF_NO_IDCT_PERM;
+#ifdef HAVE_ARMV6
+ } else if (idct_algo==FF_IDCT_SIMPLEARMV6){
+ c->idct_put= ff_simple_idct_put_armv6;
+ c->idct_add= ff_simple_idct_add_armv6;
+ c->idct = ff_simple_idct_armv6;
+ c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
+#endif
+#ifdef HAVE_ARMV5TE
+ } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){
+ c->idct_put= simple_idct_put_armv5te;
+ c->idct_add= simple_idct_add_armv5te;
+ c->idct = simple_idct_armv5te;
+ c->idct_permutation_type = FF_NO_IDCT_PERM;
+#endif
+#ifdef HAVE_IPP
+ } else if (idct_algo==FF_IDCT_IPP){
+ c->idct_put= simple_idct_ipp_put;
+ c->idct_add= simple_idct_ipp_add;
+ c->idct = simple_idct_ipp;
+ c->idct_permutation_type= FF_NO_IDCT_PERM;
+#endif
+#ifdef HAVE_NEON
+ } else if (idct_algo==FF_IDCT_SIMPLENEON){
+ c->idct_put= ff_simple_idct_put_neon;
+ c->idct_add= ff_simple_idct_add_neon;
+ c->idct = ff_simple_idct_neon;
+ c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
+#endif
+ }
+ }
+
+ c->put_pixels_tab[0][0] = put_pixels16_arm;
+ c->put_pixels_tab[0][1] = put_pixels16_x2_arm;
+ c->put_pixels_tab[0][2] = put_pixels16_y2_arm;
+ c->put_pixels_tab[0][3] = put_pixels16_xy2_arm;
+ c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm;
+ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm;
+ c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm;
+ c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm;
+ c->put_pixels_tab[1][0] = put_pixels8_arm;
+ c->put_pixels_tab[1][1] = put_pixels8_x2_arm;
+ c->put_pixels_tab[1][2] = put_pixels8_y2_arm;
+ c->put_pixels_tab[1][3] = put_pixels8_xy2_arm;
+ c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;
+ c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm;
+ c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm;
+ c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;
+
+#ifdef HAVE_ARMV5TE
+ c->prefetch = ff_prefetch_arm;
+#endif
+
+#ifdef HAVE_IWMMXT
+ dsputil_init_iwmmxt(c, avctx);
+#endif
+#ifdef HAVE_ARMVFP
+ ff_float_init_arm_vfp(c, avctx);
+#endif
+#ifdef HAVE_NEON
+ ff_dsputil_init_neon(c, avctx);
+#endif
+}
diff --git a/libavcodec/arm/dsputil_arm_s.S b/libavcodec/arm/dsputil_arm_s.S
new file mode 100644
index 0000000..639b7b8
--- /dev/null
+++ b/libavcodec/arm/dsputil_arm_s.S
@@ -0,0 +1,799 @@
+@
+@ ARMv4 optimized DSP utils
+@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
+@
+@ This file is part of FFmpeg.
+@
+@ FFmpeg is free software; you can redistribute it and/or
+@ modify it under the terms of the GNU Lesser General Public
+@ License as published by the Free Software Foundation; either
+@ version 2.1 of the License, or (at your option) any later version.
+@
+@ FFmpeg is distributed in the hope that it will be useful,
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+@ Lesser General Public License for more details.
+@
+@ You should have received a copy of the GNU Lesser General Public
+@ License along with FFmpeg; if not, write to the Free Software
+@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+@
+
+#include "config.h"
+#include "asm.S"
+
+ preserve8
+
+#ifndef HAVE_PLD
+.macro pld reg
+.endm
+#endif
+
+#ifdef HAVE_ARMV5TE
+function ff_prefetch_arm, export=1
+ subs r2, r2, #1
+ pld [r0]
+ add r0, r0, r1
+ bne ff_prefetch_arm
+ bx lr
+ .endfunc
+#endif
+
+.macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
+ mov \Rd0, \Rn0, lsr #(\shift * 8)
+ mov \Rd1, \Rn1, lsr #(\shift * 8)
+ mov \Rd2, \Rn2, lsr #(\shift * 8)
+ mov \Rd3, \Rn3, lsr #(\shift * 8)
+ orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
+ orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
+ orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
+ orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
+.endm
+.macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
+ mov \R0, \R0, lsr #(\shift * 8)
+ orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
+ mov \R1, \R1, lsr #(\shift * 8)
+ orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
+.endm
+.macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
+ mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
+ mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
+ orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
+ orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
+.endm
+
+.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
+ @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
+ @ Rmask = 0xFEFEFEFE
+ @ Rn = destroy
+ eor \Rd0, \Rn0, \Rm0
+ eor \Rd1, \Rn1, \Rm1
+ orr \Rn0, \Rn0, \Rm0
+ orr \Rn1, \Rn1, \Rm1
+ and \Rd0, \Rd0, \Rmask
+ and \Rd1, \Rd1, \Rmask
+ sub \Rd0, \Rn0, \Rd0, lsr #1
+ sub \Rd1, \Rn1, \Rd1, lsr #1
+.endm
+
+.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
+ @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
+ @ Rmask = 0xFEFEFEFE
+ @ Rn = destroy
+ eor \Rd0, \Rn0, \Rm0
+ eor \Rd1, \Rn1, \Rm1
+ and \Rn0, \Rn0, \Rm0
+ and \Rn1, \Rn1, \Rm1
+ and \Rd0, \Rd0, \Rmask
+ and \Rd1, \Rd1, \Rmask
+ add \Rd0, \Rn0, \Rd0, lsr #1
+ add \Rd1, \Rn1, \Rd1, lsr #1
+.endm
+
+@ ----------------------------------------------------------------
+ .align 8
+function put_pixels16_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixles = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r11, lr} @ R14 is also called LR
+ adr r5, 5f
+ ands r4, r1, #3
+ bic r1, r1, #3
+ add r5, r5, r4, lsl #2
+ ldrne pc, [r5]
+1:
+ ldmia r1, {r4-r7}
+ add r1, r1, r2
+ stmia r0, {r4-r7}
+ pld [r1]
+ subs r3, r3, #1
+ add r0, r0, r2
+ bne 1b
+ ldmfd sp!, {r4-r11, pc}
+ .align 8
+2:
+ ldmia r1, {r4-r8}
+ add r1, r1, r2
+ ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
+ pld [r1]
+ subs r3, r3, #1
+ stmia r0, {r9-r12}
+ add r0, r0, r2
+ bne 2b
+ ldmfd sp!, {r4-r11, pc}
+ .align 8
+3:
+ ldmia r1, {r4-r8}
+ add r1, r1, r2
+ ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
+ pld [r1]
+ subs r3, r3, #1
+ stmia r0, {r9-r12}
+ add r0, r0, r2
+ bne 3b
+ ldmfd sp!, {r4-r11, pc}
+ .align 8
+4:
+ ldmia r1, {r4-r8}
+ add r1, r1, r2
+ ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
+ pld [r1]
+ subs r3, r3, #1
+ stmia r0, {r9-r12}
+ add r0, r0, r2
+ bne 4b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+5:
+ .word 1b
+ .word 2b
+ .word 3b
+ .word 4b
+ .endfunc
+
+@ ----------------------------------------------------------------
+ .align 8
+function put_pixels8_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixles = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r5,lr} @ R14 is also called LR
+ adr r5, 5f
+ ands r4, r1, #3
+ bic r1, r1, #3
+ add r5, r5, r4, lsl #2
+ ldrne pc, [r5]
+1:
+ ldmia r1, {r4-r5}
+ add r1, r1, r2
+ subs r3, r3, #1
+ pld [r1]
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 1b
+ ldmfd sp!, {r4-r5,pc}
+ .align 8
+2:
+ ldmia r1, {r4-r5, r12}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12
+ pld [r1]
+ subs r3, r3, #1
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 2b
+ ldmfd sp!, {r4-r5,pc}
+ .align 8
+3:
+ ldmia r1, {r4-r5, r12}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12
+ pld [r1]
+ subs r3, r3, #1
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 3b
+ ldmfd sp!, {r4-r5,pc}
+ .align 8
+4:
+ ldmia r1, {r4-r5, r12}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12
+ pld [r1]
+ subs r3, r3, #1
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 4b
+ ldmfd sp!, {r4-r5,pc}
+ .align 8
+5:
+ .word 1b
+ .word 2b
+ .word 3b
+ .word 4b
+ .endfunc
+
+@ ----------------------------------------------------------------
+ .align 8
+function put_pixels8_x2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixles = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r10,lr} @ R14 is also called LR
+ adr r5, 5f
+ ands r4, r1, #3
+ ldr r12, [r5]
+ add r5, r5, r4, lsl #2
+ bic r1, r1, #3
+ ldrne pc, [r5]
+1:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ subs r3, r3, #1
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ bne 1b
+ ldmfd sp!, {r4-r10,pc}
+ .align 8
+2:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+ ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 2b
+ ldmfd sp!, {r4-r10,pc}
+ .align 8
+3:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
+ ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 3b
+ ldmfd sp!, {r4-r10,pc}
+ .align 8
+4:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r8, r9, r6, r7, r5, r10, r12
+ subs r3, r3, #1
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ bne 4b
+ ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
+ .align 8
+5:
+ .word 0xFEFEFEFE
+ .word 2b
+ .word 3b
+ .word 4b
+ .endfunc
+
+ .align 8
+function put_no_rnd_pixels8_x2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixles = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r10,lr} @ R14 is also called LR
+ adr r5, 5f
+ ands r4, r1, #3
+ ldr r12, [r5]
+ add r5, r5, r4, lsl #2
+ bic r1, r1, #3
+ ldrne pc, [r5]
+1:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ subs r3, r3, #1
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ bne 1b
+ ldmfd sp!, {r4-r10,pc}
+ .align 8
+2:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+ ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 2b
+ ldmfd sp!, {r4-r10,pc}
+ .align 8
+3:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
+ ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 3b
+ ldmfd sp!, {r4-r10,pc}
+ .align 8
+4:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
+ subs r3, r3, #1
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ bne 4b
+ ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
+ .align 8
+5:
+ .word 0xFEFEFEFE
+ .word 2b
+ .word 3b
+ .word 4b
+ .endfunc
+
+
+@ ----------------------------------------------------------------
+ .align 8
+function put_pixels8_y2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixles = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+ adr r5, 5f
+ ands r4, r1, #3
+ mov r3, r3, lsr #1
+ ldr r12, [r5]
+ add r5, r5, r4, lsl #2
+ bic r1, r1, #3
+ ldrne pc, [r5]
+1:
+ ldmia r1, {r4-r5}
+ add r1, r1, r2
+6: ldmia r1, {r6-r7}
+ add r1, r1, r2
+ pld [r1]
+ RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ ldmia r1, {r4-r5}
+ add r1, r1, r2
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ pld [r1]
+ RND_AVG32 r8, r9, r6, r7, r4, r5, r12
+ subs r3, r3, #1
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+2:
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+6: ldmia r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
+ RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+ subs r3, r3, #1
+ RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+3:
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+6: ldmia r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
+ RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+ subs r3, r3, #1
+ RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+4:
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+6: ldmia r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
+ RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+ subs r3, r3, #1
+ RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+
+ .align 8
+5:
+ .word 0xFEFEFEFE
+ .word 2b
+ .word 3b
+ .word 4b
+ .endfunc
+
+ .align 8
+function put_no_rnd_pixels8_y2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixles = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+ adr r5, 5f
+ ands r4, r1, #3
+ mov r3, r3, lsr #1
+ ldr r12, [r5]
+ add r5, r5, r4, lsl #2
+ bic r1, r1, #3
+ ldrne pc, [r5]
+1:
+ ldmia r1, {r4-r5}
+ add r1, r1, r2
+6: ldmia r1, {r6-r7}
+ add r1, r1, r2
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ ldmia r1, {r4-r5}
+ add r1, r1, r2
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
+ subs r3, r3, #1
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+2:
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+6: ldmia r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
+ NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+ subs r3, r3, #1
+ NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+3:
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+6: ldmia r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
+ NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+ subs r3, r3, #1
+ NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+4:
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+6: ldmia r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
+ NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+ subs r3, r3, #1
+ NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+5:
+ .word 0xFEFEFEFE
+ .word 2b
+ .word 3b
+ .word 4b
+ .endfunc
+
+@ ----------------------------------------------------------------
+.macro RND_XY2_IT align
+ @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
+ @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
+.if \align == 0
+ ldmia r1, {r6-r8}
+.elseif \align == 3
+ ldmia r1, {r5-r7}
+.else
+ ldmia r1, {r8-r10}
+.endif
+ add r1, r1, r2
+ pld [r1]
+.if \align == 0
+ ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8
+.elseif \align == 1
+ ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10
+ ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10
+.elseif \align == 2
+ ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10
+ ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10
+.elseif \align == 3
+ ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7
+.endif
+ ldr r14, [r12, #0] @ 0x03030303
+ tst r3, #1
+ and r8, r4, r14
+ and r9, r5, r14
+ and r10, r6, r14
+ and r11, r7, r14
+ ldreq r14, [r12, #16] @ 0x02020202/0x01010101
+ add r8, r8, r10
+ add r9, r9, r11
+ addeq r8, r8, r14
+ addeq r9, r9, r14
+ ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2
+ and r4, r14, r4, lsr #2
+ and r5, r14, r5, lsr #2
+ and r6, r14, r6, lsr #2
+ and r7, r14, r7, lsr #2
+ add r10, r4, r6
+ add r11, r5, r7
+ subs r3, r3, #1
+.endm
+
+.macro RND_XY2_EXPAND align
+ RND_XY2_IT \align
+6: stmfd sp!, {r8-r11}
+ RND_XY2_IT \align
+ ldmfd sp!, {r4-r7}
+ add r4, r4, r8
+ add r5, r5, r9
+ add r6, r6, r10
+ add r7, r7, r11
+ ldr r14, [r12, #24] @ 0x0F0F0F0F
+ and r4, r14, r4, lsr #2
+ and r5, r14, r5, lsr #2
+ add r4, r4, r6
+ add r5, r5, r7
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bge 6b
+ ldmfd sp!, {r4-r11,pc}
+.endm
+
+ .align 8
+function put_pixels8_xy2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixles = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+ adrl r12, 5f
+ ands r4, r1, #3
+ add r5, r12, r4, lsl #2
+ bic r1, r1, #3
+ ldrne pc, [r5]
+1:
+ RND_XY2_EXPAND 0
+
+ .align 8
+2:
+ RND_XY2_EXPAND 1
+
+ .align 8
+3:
+ RND_XY2_EXPAND 2
+
+ .align 8
+4:
+ RND_XY2_EXPAND 3
+
+5:
+ .word 0x03030303
+ .word 2b
+ .word 3b
+ .word 4b
+ .word 0x02020202
+ .word 0xFCFCFCFC >> 2
+ .word 0x0F0F0F0F
+ .endfunc
+
+ .align 8
+function put_no_rnd_pixels8_xy2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixles = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+ adrl r12, 5f
+ ands r4, r1, #3
+ add r5, r12, r4, lsl #2
+ bic r1, r1, #3
+ ldrne pc, [r5]
+1:
+ RND_XY2_EXPAND 0
+
+ .align 8
+2:
+ RND_XY2_EXPAND 1
+
+ .align 8
+3:
+ RND_XY2_EXPAND 2
+
+ .align 8
+4:
+ RND_XY2_EXPAND 3
+
+5:
+ .word 0x03030303
+ .word 2b
+ .word 3b
+ .word 4b
+ .word 0x01010101
+ .word 0xFCFCFCFC >> 2
+ .word 0x0F0F0F0F
+ .endfunc
+
+@ void ff_add_pixels_clamped_ARM(int16_t *block, uint8_t *dest, int stride)
+function ff_add_pixels_clamped_ARM, export=1
+ push {r4-r10}
+ mov r10, #8
+1:
+ ldr r4, [r1] /* load dest */
+ /* block[0] and block[1]*/
+ ldrsh r5, [r0]
+ ldrsh r7, [r0, #2]
+ and r6, r4, #0xFF
+ and r8, r4, #0xFF00
+ add r6, r5, r6
+ add r8, r7, r8, lsr #8
+ mvn r5, r5
+ mvn r7, r7
+ tst r6, #0x100
+ movne r6, r5, lsr #24
+ tst r8, #0x100
+ movne r8, r7, lsr #24
+ mov r9, r6
+ ldrsh r5, [r0, #4] /* moved form [A] */
+ orr r9, r9, r8, lsl #8
+ /* block[2] and block[3] */
+ /* [A] */
+ ldrsh r7, [r0, #6]
+ and r6, r4, #0xFF0000
+ and r8, r4, #0xFF000000
+ add r6, r5, r6, lsr #16
+ add r8, r7, r8, lsr #24
+ mvn r5, r5
+ mvn r7, r7
+ tst r6, #0x100
+ movne r6, r5, lsr #24
+ tst r8, #0x100
+ movne r8, r7, lsr #24
+ orr r9, r9, r6, lsl #16
+ ldr r4, [r1, #4] /* moved form [B] */
+ orr r9, r9, r8, lsl #24
+ /* store dest */
+ ldrsh r5, [r0, #8] /* moved form [C] */
+ str r9, [r1]
+
+ /* load dest */
+ /* [B] */
+ /* block[4] and block[5] */
+ /* [C] */
+ ldrsh r7, [r0, #10]
+ and r6, r4, #0xFF
+ and r8, r4, #0xFF00
+ add r6, r5, r6
+ add r8, r7, r8, lsr #8
+ mvn r5, r5
+ mvn r7, r7
+ tst r6, #0x100
+ movne r6, r5, lsr #24
+ tst r8, #0x100
+ movne r8, r7, lsr #24
+ mov r9, r6
+ ldrsh r5, [r0, #12] /* moved from [D] */
+ orr r9, r9, r8, lsl #8
+ /* block[6] and block[7] */
+ /* [D] */
+ ldrsh r7, [r0, #14]
+ and r6, r4, #0xFF0000
+ and r8, r4, #0xFF000000
+ add r6, r5, r6, lsr #16
+ add r8, r7, r8, lsr #24
+ mvn r5, r5
+ mvn r7, r7
+ tst r6, #0x100
+ movne r6, r5, lsr #24
+ tst r8, #0x100
+ movne r8, r7, lsr #24
+ orr r9, r9, r6, lsl #16
+ add r0, r0, #16 /* moved from [E] */
+ orr r9, r9, r8, lsl #24
+ subs r10, r10, #1 /* moved from [F] */
+ /* store dest */
+ str r9, [r1, #4]
+
+ /* [E] */
+ /* [F] */
+ add r1, r1, r2
+ bne 1b
+
+ pop {r4-r10}
+ bx lr
+ .endfunc
diff --git a/libavcodec/armv4l/dsputil_iwmmxt.c b/libavcodec/arm/dsputil_iwmmxt.c
similarity index 100%
rename from libavcodec/armv4l/dsputil_iwmmxt.c
rename to libavcodec/arm/dsputil_iwmmxt.c
diff --git a/libavcodec/arm/dsputil_iwmmxt_rnd_template.c b/libavcodec/arm/dsputil_iwmmxt_rnd_template.c
new file mode 100644
index 0000000..35a5a9b
--- /dev/null
+++ b/libavcodec/arm/dsputil_iwmmxt_rnd_template.c
@@ -0,0 +1,1114 @@
+/*
+ * iWMMXt optimized DSP utils
+ * copyright (c) 2004 AGAWA Koji
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ __asm__ volatile (
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r4, %[pixels], %[line_size] \n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+ "1: \n\t"
+ "wldrd wr0, [%[pixels]] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wldrd wr1, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr3, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wldrd wr4, [r4, #8] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "walignr1 wr8, wr0, wr1 \n\t"
+ "pld [r4] \n\t"
+ "pld [r4, #32] \n\t"
+ "walignr1 wr10, wr3, wr4 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "wstrd wr10, [r5] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "bne 1b \n\t"
+ : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
+ :
+ : "memory", "r4", "r5", "r12");
+}
+
+void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ __asm__ volatile (
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r4, %[pixels], %[line_size] \n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+ "1: \n\t"
+ "wldrd wr0, [%[pixels]] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wldrd wr1, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr3, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wldrd wr4, [r4, #8] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "walignr1 wr8, wr0, wr1 \n\t"
+ "wldrd wr0, [%[block]] \n\t"
+ "wldrd wr2, [r5] \n\t"
+ "pld [r4] \n\t"
+ "pld [r4, #32] \n\t"
+ "walignr1 wr10, wr3, wr4 \n\t"
+ WAVG2B" wr8, wr8, wr0 \n\t"
+ WAVG2B" wr10, wr10, wr2 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "wstrd wr10, [r5] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "pld [r5] \n\t"
+ "pld [r5, #32] \n\t"
+ "bne 1b \n\t"
+ : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
+ :
+ : "memory", "r4", "r5", "r12");
+}
+
+void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ __asm__ volatile (
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r4, %[pixels], %[line_size] \n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+ "1: \n\t"
+ "wldrd wr0, [%[pixels]] \n\t"
+ "wldrd wr1, [%[pixels], #8] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wldrd wr2, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr3, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr8, wr0, wr1 \n\t"
+ "wldrd wr4, [r4, #8] \n\t"
+ "walignr1 wr9, wr1, wr2 \n\t"
+ "wldrd wr5, [r4, #16] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "pld [r4] \n\t"
+ "pld [r4, #32] \n\t"
+ "walignr1 wr10, wr3, wr4 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "walignr1 wr11, wr4, wr5 \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "wstrd wr10, [r5] \n\t"
+ "wstrd wr11, [r5, #8] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "bne 1b \n\t"
+ : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
+ :
+ : "memory", "r4", "r5", "r12");
+}
+
+void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ __asm__ volatile (
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r4, %[pixels], %[line_size]\n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+ "1: \n\t"
+ "wldrd wr0, [%[pixels]] \n\t"
+ "wldrd wr1, [%[pixels], #8] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wldrd wr2, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr3, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr8, wr0, wr1 \n\t"
+ "wldrd wr4, [r4, #8] \n\t"
+ "walignr1 wr9, wr1, wr2 \n\t"
+ "wldrd wr5, [r4, #16] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "wldrd wr0, [%[block]] \n\t"
+ "pld [r4] \n\t"
+ "wldrd wr1, [%[block], #8] \n\t"
+ "pld [r4, #32] \n\t"
+ "wldrd wr2, [r5] \n\t"
+ "walignr1 wr10, wr3, wr4 \n\t"
+ "wldrd wr3, [r5, #8] \n\t"
+ WAVG2B" wr8, wr8, wr0 \n\t"
+ WAVG2B" wr9, wr9, wr1 \n\t"
+ WAVG2B" wr10, wr10, wr2 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "walignr1 wr11, wr4, wr5 \n\t"
+ WAVG2B" wr11, wr11, wr3 \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "wstrd wr10, [r5] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "wstrd wr11, [r5, #8] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "pld [r5] \n\t"
+ "pld [r5, #32] \n\t"
+ "bne 1b \n\t"
+ : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
+ :
+ : "memory", "r4", "r5", "r12");
+}
+
+void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r12, r12, #1 \n\t"
+ "add r4, %[pixels], %[line_size]\n\t"
+ "tmcr wcgr2, r12 \n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+
+ "1: \n\t"
+ "wldrd wr10, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr13, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "wldrd wr14, [r4, #8] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "pld [r4] \n\t"
+ "pld [r4, #32] \n\t"
+ "walignr1 wr2, wr13, wr14 \n\t"
+ "wmoveq wr4, wr11 \n\t"
+ "wmoveq wr6, wr14 \n\t"
+ "walignr2ne wr4, wr10, wr11 \n\t"
+ "walignr2ne wr6, wr13, wr14 \n\t"
+ WAVG2B" wr0, wr0, wr4 \n\t"
+ WAVG2B" wr2, wr2, wr6 \n\t"
+ "wstrd wr0, [%[block]] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wstrd wr2, [r5] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+ :
+ : "r4", "r5", "r12", "memory");
+}
+
+void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r12, r12, #1 \n\t"
+ "add r4, %[pixels], %[line_size]\n\t"
+ "tmcr wcgr2, r12 \n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+
+ "1: \n\t"
+ "wldrd wr10, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr13, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "wldrd wr14, [r4, #8] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wldrd wr15, [r4, #16] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "pld [r4] \n\t"
+ "pld [r4, #32] \n\t"
+ "walignr1 wr1, wr11, wr12 \n\t"
+ "walignr1 wr2, wr13, wr14 \n\t"
+ "walignr1 wr3, wr14, wr15 \n\t"
+ "wmoveq wr4, wr11 \n\t"
+ "wmoveq wr5, wr12 \n\t"
+ "wmoveq wr6, wr14 \n\t"
+ "wmoveq wr7, wr15 \n\t"
+ "walignr2ne wr4, wr10, wr11 \n\t"
+ "walignr2ne wr5, wr11, wr12 \n\t"
+ "walignr2ne wr6, wr13, wr14 \n\t"
+ "walignr2ne wr7, wr14, wr15 \n\t"
+ WAVG2B" wr0, wr0, wr4 \n\t"
+ WAVG2B" wr1, wr1, wr5 \n\t"
+ "wstrd wr0, [%[block]] \n\t"
+ WAVG2B" wr2, wr2, wr6 \n\t"
+ "wstrd wr1, [%[block], #8] \n\t"
+ WAVG2B" wr3, wr3, wr7 \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "wstrd wr2, [r5] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wstrd wr3, [r5, #8] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+ :
+ : "r4", "r5", "r12", "memory");
+}
+
+void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r12, r12, #1 \n\t"
+ "add r4, %[pixels], %[line_size]\n\t"
+ "tmcr wcgr2, r12 \n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+ "pld [r5] \n\t"
+ "pld [r5, #32] \n\t"
+
+ "1: \n\t"
+ "wldrd wr10, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr13, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "wldrd wr14, [r4, #8] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "pld [r4] \n\t"
+ "pld [r4, #32] \n\t"
+ "walignr1 wr2, wr13, wr14 \n\t"
+ "wmoveq wr4, wr11 \n\t"
+ "wmoveq wr6, wr14 \n\t"
+ "walignr2ne wr4, wr10, wr11 \n\t"
+ "wldrd wr10, [%[block]] \n\t"
+ "walignr2ne wr6, wr13, wr14 \n\t"
+ "wldrd wr12, [r5] \n\t"
+ WAVG2B" wr0, wr0, wr4 \n\t"
+ WAVG2B" wr2, wr2, wr6 \n\t"
+ WAVG2B" wr0, wr0, wr10 \n\t"
+ WAVG2B" wr2, wr2, wr12 \n\t"
+ "wstrd wr0, [%[block]] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wstrd wr2, [r5] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "pld [r5] \n\t"
+ "pld [r5, #32] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+ :
+ : "r4", "r5", "r12", "memory");
+}
+
+void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r12, r12, #1 \n\t"
+ "add r4, %[pixels], %[line_size]\n\t"
+ "tmcr wcgr2, r12 \n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+ "pld [r5] \n\t"
+ "pld [r5, #32] \n\t"
+
+ "1: \n\t"
+ "wldrd wr10, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr13, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "wldrd wr14, [r4, #8] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wldrd wr15, [r4, #16] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "pld [r4] \n\t"
+ "pld [r4, #32] \n\t"
+ "walignr1 wr1, wr11, wr12 \n\t"
+ "walignr1 wr2, wr13, wr14 \n\t"
+ "walignr1 wr3, wr14, wr15 \n\t"
+ "wmoveq wr4, wr11 \n\t"
+ "wmoveq wr5, wr12 \n\t"
+ "wmoveq wr6, wr14 \n\t"
+ "wmoveq wr7, wr15 \n\t"
+ "walignr2ne wr4, wr10, wr11 \n\t"
+ "walignr2ne wr5, wr11, wr12 \n\t"
+ "walignr2ne wr6, wr13, wr14 \n\t"
+ "walignr2ne wr7, wr14, wr15 \n\t"
+ "wldrd wr10, [%[block]] \n\t"
+ WAVG2B" wr0, wr0, wr4 \n\t"
+ "wldrd wr11, [%[block], #8] \n\t"
+ WAVG2B" wr1, wr1, wr5 \n\t"
+ "wldrd wr12, [r5] \n\t"
+ WAVG2B" wr2, wr2, wr6 \n\t"
+ "wldrd wr13, [r5, #8] \n\t"
+ WAVG2B" wr3, wr3, wr7 \n\t"
+ WAVG2B" wr0, wr0, wr10 \n\t"
+ WAVG2B" wr1, wr1, wr11 \n\t"
+ WAVG2B" wr2, wr2, wr12 \n\t"
+ WAVG2B" wr3, wr3, wr13 \n\t"
+ "wstrd wr0, [%[block]] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wstrd wr1, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "wstrd wr2, [r5] \n\t"
+ "pld [%[block]] \n\t"
+ "wstrd wr3, [r5, #8] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "pld [%[block], #32] \n\t"
+ "pld [r5] \n\t"
+ "pld [r5, #32] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+ :
+ :"r4", "r5", "r12", "memory");
+}
+
+void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "pld [%[block]] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+
+ "1: \n\t"
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr4, wr10, wr11 \n\t"
+ "wldrd wr10, [%[block]] \n\t"
+ WAVG2B" wr8, wr0, wr4 \n\t"
+ WAVG2B" wr8, wr8, wr10 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "pld [%[block]] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "wldrd wr10, [%[block]] \n\t"
+ WAVG2B" wr8, wr0, wr4 \n\t"
+ WAVG2B" wr8, wr8, wr10 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ "subs %[h], %[h], #2 \n\t"
+ "pld [%[block]] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+ :
+ : "cc", "memory", "r12");
+}
+
+void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "walignr1 wr1, wr11, wr12 \n\t"
+
+ "1: \n\t"
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr4, wr10, wr11 \n\t"
+ "walignr1 wr5, wr11, wr12 \n\t"
+ WAVG2B" wr8, wr0, wr4 \n\t"
+ WAVG2B" wr9, wr1, wr5 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "walignr1 wr1, wr11, wr12 \n\t"
+ WAVG2B" wr8, wr0, wr4 \n\t"
+ WAVG2B" wr9, wr1, wr5 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ "subs %[h], %[h], #2 \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+ :
+ : "r4", "r5", "r12", "memory");
+}
+
+void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "pld [%[block]] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "walignr1 wr1, wr11, wr12 \n\t"
+
+ "1: \n\t"
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr4, wr10, wr11 \n\t"
+ "walignr1 wr5, wr11, wr12 \n\t"
+ "wldrd wr10, [%[block]] \n\t"
+ "wldrd wr11, [%[block], #8] \n\t"
+ WAVG2B" wr8, wr0, wr4 \n\t"
+ WAVG2B" wr9, wr1, wr5 \n\t"
+ WAVG2B" wr8, wr8, wr10 \n\t"
+ WAVG2B" wr9, wr9, wr11 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "pld [%[block]] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "walignr1 wr1, wr11, wr12 \n\t"
+ "wldrd wr10, [%[block]] \n\t"
+ "wldrd wr11, [%[block], #8] \n\t"
+ WAVG2B" wr8, wr0, wr4 \n\t"
+ WAVG2B" wr9, wr1, wr5 \n\t"
+ WAVG2B" wr8, wr8, wr10 \n\t"
+ WAVG2B" wr9, wr9, wr11 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ "subs %[h], %[h], #2 \n\t"
+ "pld [%[block]] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+ :
+ : "r4", "r5", "r12", "memory");
+}
+
+void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "mov r12, #2 \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "tmcr wcgr0, r12 \n\t" /* for shift value */
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr12, [%[pixels]] \n\t"
+ "add r12, r12, #1 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "tmcr wcgr2, r12 \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "cmp r12, #8 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+
+ "1: \n\t"
+ // [wr0 wr1 wr2 wr3]
+ // [wr4 wr5 wr6 wr7] <= *
+ "wldrd wr12, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr6, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "wunpckelub wr4, wr6 \n\t"
+ "wunpckehub wr5, wr6 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "waddhus wr4, wr4, wr8 \n\t"
+ "waddhus wr5, wr5, wr9 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr12, [%[pixels]] \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
+ : [line_size]"r"(line_size)
+ : "r12", "memory");
+}
+
+void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "mov r12, #2 \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "tmcr wcgr0, r12 \n\t" /* for shift value */
+ /* alignment */
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r12, r12, #1 \n\t"
+ "tmcr wcgr2, r12 \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr12, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "wldrd wr14, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr3, wr13, wr14 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "wmoveq wr11, wr14 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "walignr2ne wr11, wr13, wr14 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr2, wr3 \n\t"
+ "wunpckehub wr3, wr3 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "wunpckelub wr10, wr11 \n\t"
+ "wunpckehub wr11, wr11 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+ "waddhus wr2, wr2, wr10 \n\t"
+ "waddhus wr3, wr3, wr11 \n\t"
+
+ "1: \n\t"
+ // [wr0 wr1 wr2 wr3]
+ // [wr4 wr5 wr6 wr7] <= *
+ "wldrd wr12, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "wldrd wr14, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr6, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr7, wr13, wr14 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "wmoveq wr11, wr14 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "walignr2ne wr11, wr13, wr14 \n\t"
+ "wunpckelub wr4, wr6 \n\t"
+ "wunpckehub wr5, wr6 \n\t"
+ "wunpckelub wr6, wr7 \n\t"
+ "wunpckehub wr7, wr7 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "wunpckelub wr10, wr11 \n\t"
+ "wunpckehub wr11, wr11 \n\t"
+ "waddhus wr4, wr4, wr8 \n\t"
+ "waddhus wr5, wr5, wr9 \n\t"
+ "waddhus wr6, wr6, wr10 \n\t"
+ "waddhus wr7, wr7, wr11 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr10, wr2, wr6 \n\t"
+ "waddhus wr11, wr3, wr7 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "waddhus wr10, wr10, wr15 \n\t"
+ "waddhus wr11, wr11, wr15 \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wsrlhg wr10, wr10, wcgr0 \n\t"
+ "wsrlhg wr11, wr11, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ "wpackhus wr9, wr10, wr11 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr12, [%[pixels]] \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "wldrd wr14, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr3, wr13, wr14 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "wmoveq wr11, wr14 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "walignr2ne wr11, wr13, wr14 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr2, wr3 \n\t"
+ "wunpckehub wr3, wr3 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "wunpckelub wr10, wr11 \n\t"
+ "wunpckehub wr11, wr11 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+ "waddhus wr2, wr2, wr10 \n\t"
+ "waddhus wr3, wr3, wr11 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr10, wr2, wr6 \n\t"
+ "waddhus wr11, wr3, wr7 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "waddhus wr10, wr10, wr15 \n\t"
+ "waddhus wr11, wr11, wr15 \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wsrlhg wr10, wr10, wcgr0 \n\t"
+ "wsrlhg wr11, wr11, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ "wpackhus wr9, wr10, wr11 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ "subs %[h], %[h], #2 \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
+ : [line_size]"r"(line_size)
+ : "r12", "memory");
+}
+
+void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "pld [%[pixels]] \n\t"
+ "mov r12, #2 \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "tmcr wcgr0, r12 \n\t" /* for shift value */
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr12, [%[pixels]] \n\t"
+ "add r12, r12, #1 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "tmcr wcgr2, r12 \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "cmp r12, #8 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+
+ "1: \n\t"
+ // [wr0 wr1 wr2 wr3]
+ // [wr4 wr5 wr6 wr7] <= *
+ "wldrd wr12, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr6, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "wunpckelub wr4, wr6 \n\t"
+ "wunpckehub wr5, wr6 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "waddhus wr4, wr4, wr8 \n\t"
+ "waddhus wr5, wr5, wr9 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "wldrd wr12, [%[block]] \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ WAVG2B" wr8, wr8, wr12 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "wldrd wr12, [%[pixels]] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "wldrd wr12, [%[block]] \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ WAVG2B" wr8, wr8, wr12 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
+ : [line_size]"r"(line_size)
+ : "r12", "memory");
+}
+
+void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "pld [%[pixels]] \n\t"
+ "mov r12, #2 \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "tmcr wcgr0, r12 \n\t" /* for shift value */
+ /* alignment */
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r12, r12, #1 \n\t"
+ "tmcr wcgr2, r12 \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr12, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "wldrd wr14, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr3, wr13, wr14 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "wmoveq wr11, wr14 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "walignr2ne wr11, wr13, wr14 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr2, wr3 \n\t"
+ "wunpckehub wr3, wr3 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "wunpckelub wr10, wr11 \n\t"
+ "wunpckehub wr11, wr11 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+ "waddhus wr2, wr2, wr10 \n\t"
+ "waddhus wr3, wr3, wr11 \n\t"
+
+ "1: \n\t"
+ // [wr0 wr1 wr2 wr3]
+ // [wr4 wr5 wr6 wr7] <= *
+ "wldrd wr12, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "wldrd wr14, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr6, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr7, wr13, wr14 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "wmoveq wr11, wr14 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "walignr2ne wr11, wr13, wr14 \n\t"
+ "wunpckelub wr4, wr6 \n\t"
+ "wunpckehub wr5, wr6 \n\t"
+ "wunpckelub wr6, wr7 \n\t"
+ "wunpckehub wr7, wr7 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "wunpckelub wr10, wr11 \n\t"
+ "wunpckehub wr11, wr11 \n\t"
+ "waddhus wr4, wr4, wr8 \n\t"
+ "waddhus wr5, wr5, wr9 \n\t"
+ "waddhus wr6, wr6, wr10 \n\t"
+ "waddhus wr7, wr7, wr11 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr10, wr2, wr6 \n\t"
+ "waddhus wr11, wr3, wr7 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "waddhus wr10, wr10, wr15 \n\t"
+ "waddhus wr11, wr11, wr15 \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wldrd wr12, [%[block]] \n\t"
+ "wldrd wr13, [%[block], #8] \n\t"
+ "wsrlhg wr10, wr10, wcgr0 \n\t"
+ "wsrlhg wr11, wr11, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ "wpackhus wr9, wr10, wr11 \n\t"
+ WAVG2B" wr8, wr8, wr12 \n\t"
+ WAVG2B" wr9, wr9, wr13 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr12, [%[pixels]] \n\t"
+ "pld [%[block]] \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "pld [%[block], #32] \n\t"
+ "wldrd wr14, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr3, wr13, wr14 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "wmoveq wr11, wr14 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "walignr2ne wr11, wr13, wr14 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr2, wr3 \n\t"
+ "wunpckehub wr3, wr3 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "wunpckelub wr10, wr11 \n\t"
+ "wunpckehub wr11, wr11 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+ "waddhus wr2, wr2, wr10 \n\t"
+ "waddhus wr3, wr3, wr11 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr10, wr2, wr6 \n\t"
+ "waddhus wr11, wr3, wr7 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "waddhus wr10, wr10, wr15 \n\t"
+ "waddhus wr11, wr11, wr15 \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wldrd wr12, [%[block]] \n\t"
+ "wldrd wr13, [%[block], #8] \n\t"
+ "wsrlhg wr10, wr10, wcgr0 \n\t"
+ "wsrlhg wr11, wr11, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ "wpackhus wr9, wr10, wr11 \n\t"
+ WAVG2B" wr8, wr8, wr12 \n\t"
+ WAVG2B" wr9, wr9, wr13 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
+ : [line_size]"r"(line_size)
+ : "r12", "memory");
+}
diff --git a/libavcodec/arm/dsputil_neon.c b/libavcodec/arm/dsputil_neon.c
new file mode 100644
index 0000000..e50e160
--- /dev/null
+++ b/libavcodec/arm/dsputil_neon.c
@@ -0,0 +1,189 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+
+void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+
+void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int);
+
+void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int);
+
+void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int);
+
+void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
+
+void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+
+void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+
+void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+
+void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
+void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
+void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
+ DCTELEM *block, int stride,
+ const uint8_t nnzc[6*8]);
+void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
+ DCTELEM *block, int stride,
+ const uint8_t nnzc[6*8]);
+void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
+ DCTELEM *block, int stride,
+ const uint8_t nnzc[6*8]);
+
+void ff_float_to_int16_neon(int16_t *, const float *, long);
+void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
+
+void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
+{
+ c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
+ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
+ c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
+ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
+ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
+ c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
+
+ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
+ c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
+
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
+
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
+ c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
+
+ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
+ c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
+
+ c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
+ c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
+ c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
+ c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
+ c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
+ c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
+ c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
+ c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
+ c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
+ c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
+ c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
+ c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
+ c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
+ c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
+ c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
+ c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;
+
+ c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon;
+ c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
+ c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
+ c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
+ c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
+ c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
+ c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
+ c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
+ c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
+ c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
+ c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
+ c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
+ c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
+ c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
+ c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
+ c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;
+
+ c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon;
+
+ c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
+ c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
+ c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+ c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+
+ c->h264_idct_add = ff_h264_idct_add_neon;
+ c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
+ c->h264_idct_add16 = ff_h264_idct_add16_neon;
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
+ c->h264_idct_add8 = ff_h264_idct_add8_neon;
+
+ if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
+ c->float_to_int16 = ff_float_to_int16_neon;
+ c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
+ }
+}
diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S
new file mode 100644
index 0000000..4f86714
--- /dev/null
+++ b/libavcodec/arm/dsputil_neon_s.S
@@ -0,0 +1,611 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+ preserve8
+ .fpu neon
+ .text
+
+ .macro pixels16 avg=0
+.if \avg
+ mov ip, r0
+.endif
+1: vld1.64 {d0, d1}, [r1], r2
+ vld1.64 {d2, d3}, [r1], r2
+ vld1.64 {d4, d5}, [r1], r2
+ pld [r1, r2, lsl #2]
+ vld1.64 {d6, d7}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ pld [r1, r2, lsl #1]
+.if \avg
+ vld1.64 {d16,d17}, [ip], r2
+ vrhadd.u8 q0, q0, q8
+ vld1.64 {d18,d19}, [ip], r2
+ vrhadd.u8 q1, q1, q9
+ vld1.64 {d20,d21}, [ip], r2
+ vrhadd.u8 q2, q2, q10
+ vld1.64 {d22,d23}, [ip], r2
+ vrhadd.u8 q3, q3, q11
+.endif
+ subs r3, r3, #4
+ vst1.64 {d0, d1}, [r0,:128], r2
+ vst1.64 {d2, d3}, [r0,:128], r2
+ vst1.64 {d4, d5}, [r0,:128], r2
+ vst1.64 {d6, d7}, [r0,:128], r2
+ bne 1b
+ bx lr
+ .endm
+
+ .macro pixels16_x2 vhadd=vrhadd.u8
+1: vld1.64 {d0-d2}, [r1], r2
+ vld1.64 {d4-d6}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ subs r3, r3, #2
+ vext.8 q1, q0, q1, #1
+ \vhadd q0, q0, q1
+ vext.8 q3, q2, q3, #1
+ \vhadd q2, q2, q3
+ vst1.64 {d0, d1}, [r0,:128], r2
+ vst1.64 {d4, d5}, [r0,:128], r2
+ bne 1b
+ bx lr
+ .endm
+
+ .macro pixels16_y2 vhadd=vrhadd.u8
+ push {lr}
+ add ip, r1, r2
+ lsl lr, r2, #1
+ vld1.64 {d0, d1}, [r1], lr
+ vld1.64 {d2, d3}, [ip], lr
+1: subs r3, r3, #2
+ \vhadd q2, q0, q1
+ vld1.64 {d0, d1}, [r1], lr
+ \vhadd q3, q0, q1
+ vld1.64 {d2, d3}, [ip], lr
+ pld [r1]
+ pld [ip]
+ vst1.64 {d4, d5}, [r0,:128], r2
+ vst1.64 {d6, d7}, [r0,:128], r2
+ bne 1b
+ pop {pc}
+ .endm
+
+ .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
+ push {lr}
+ lsl lr, r2, #1
+ add ip, r1, r2
+ vld1.64 {d0-d2}, [r1], lr
+ vld1.64 {d4-d6}, [ip], lr
+.if \no_rnd
+ vmov.i16 q13, #1
+.endif
+ pld [r1]
+ pld [ip]
+ vext.8 q1, q0, q1, #1
+ vext.8 q3, q2, q3, #1
+ vaddl.u8 q8, d0, d2
+ vaddl.u8 q10, d1, d3
+ vaddl.u8 q9, d4, d6
+ vaddl.u8 q11, d5, d7
+1: subs r3, r3, #2
+ vld1.64 {d0-d2}, [r1], lr
+ vadd.u16 q12, q8, q9
+ pld [r1]
+.if \no_rnd
+ vadd.u16 q12, q12, q13
+.endif
+ vext.8 q15, q0, q1, #1
+ vadd.u16 q1 , q10, q11
+ \vshrn d28, q12, #2
+.if \no_rnd
+ vadd.u16 q1, q1, q13
+.endif
+ \vshrn d29, q1, #2
+ vaddl.u8 q8, d0, d30
+ vld1.64 {d2-d4}, [ip], lr
+ vaddl.u8 q10, d1, d31
+ vst1.64 {d28,d29}, [r0,:128], r2
+ vadd.u16 q12, q8, q9
+ pld [ip]
+.if \no_rnd
+ vadd.u16 q12, q12, q13
+.endif
+ vext.8 q2, q1, q2, #1
+ vadd.u16 q0, q10, q11
+ \vshrn d30, q12, #2
+.if \no_rnd
+ vadd.u16 q0, q0, q13
+.endif
+ \vshrn d31, q0, #2
+ vaddl.u8 q9, d2, d4
+ vaddl.u8 q11, d3, d5
+ vst1.64 {d30,d31}, [r0,:128], r2
+ bgt 1b
+ pop {pc}
+ .endm
+
+ .macro pixels8
+1: vld1.64 {d0}, [r1], r2
+ vld1.64 {d1}, [r1], r2
+ vld1.64 {d2}, [r1], r2
+ pld [r1, r2, lsl #2]
+ vld1.64 {d3}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ pld [r1, r2, lsl #1]
+ subs r3, r3, #4
+ vst1.64 {d0}, [r0,:64], r2
+ vst1.64 {d1}, [r0,:64], r2
+ vst1.64 {d2}, [r0,:64], r2
+ vst1.64 {d3}, [r0,:64], r2
+ bne 1b
+ bx lr
+ .endm
+
+ .macro pixels8_x2 vhadd=vrhadd.u8
+1: vld1.64 {d0, d1}, [r1], r2
+ vext.8 d1, d0, d1, #1
+ vld1.64 {d2, d3}, [r1], r2
+ vext.8 d3, d2, d3, #1
+ pld [r1]
+ pld [r1, r2]
+ subs r3, r3, #2
+ vswp d1, d2
+ \vhadd q0, q0, q1
+ vst1.64 {d0}, [r0,:64], r2
+ vst1.64 {d1}, [r0,:64], r2
+ bne 1b
+ bx lr
+ .endm
+
+ .macro pixels8_y2 vhadd=vrhadd.u8
+ push {lr}
+ add ip, r1, r2
+ lsl lr, r2, #1
+ vld1.64 {d0}, [r1], lr
+ vld1.64 {d1}, [ip], lr
+1: subs r3, r3, #2
+ \vhadd d4, d0, d1
+ vld1.64 {d0}, [r1], lr
+ \vhadd d5, d0, d1
+ vld1.64 {d1}, [ip], lr
+ pld [r1]
+ pld [ip]
+ vst1.64 {d4}, [r0,:64], r2
+ vst1.64 {d5}, [r0,:64], r2
+ bne 1b
+ pop {pc}
+ .endm
+
+ .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
+ push {lr}
+ lsl lr, r2, #1
+ add ip, r1, r2
+ vld1.64 {d0, d1}, [r1], lr
+ vld1.64 {d2, d3}, [ip], lr
+.if \no_rnd
+ vmov.i16 q11, #1
+.endif
+ pld [r1]
+ pld [ip]
+ vext.8 d4, d0, d1, #1
+ vext.8 d6, d2, d3, #1
+ vaddl.u8 q8, d0, d4
+ vaddl.u8 q9, d2, d6
+1: subs r3, r3, #2
+ vld1.64 {d0, d1}, [r1], lr
+ pld [r1]
+ vadd.u16 q10, q8, q9
+ vext.8 d4, d0, d1, #1
+.if \no_rnd
+ vadd.u16 q10, q10, q11
+.endif
+ vaddl.u8 q8, d0, d4
+ \vshrn d5, q10, #2
+ vld1.64 {d2, d3}, [ip], lr
+ vadd.u16 q10, q8, q9
+ pld [ip]
+.if \no_rnd
+ vadd.u16 q10, q10, q11
+.endif
+ vst1.64 {d5}, [r0,:64], r2
+ \vshrn d7, q10, #2
+ vext.8 d6, d2, d3, #1
+ vaddl.u8 q9, d2, d6
+ vst1.64 {d7}, [r0,:64], r2
+ bgt 1b
+ pop {pc}
+ .endm
+
+ .macro pixfunc pfx name suf rnd_op args:vararg
+function ff_\pfx\name\suf\()_neon, export=1
+ \name \rnd_op \args
+ .endfunc
+ .endm
+
+ .macro pixfunc2 pfx name args:vararg
+ pixfunc \pfx \name
+ pixfunc \pfx \name \args
+ .endm
+
+function ff_put_h264_qpel16_mc00_neon, export=1
+ mov r3, #16
+ .endfunc
+
+ pixfunc put_ pixels16
+ pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8
+ pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8
+ pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
+
+function ff_avg_h264_qpel16_mc00_neon, export=1
+ mov r3, #16
+ .endfunc
+
+ pixfunc avg_ pixels16,, 1
+
+function ff_put_h264_qpel8_mc00_neon, export=1
+ mov r3, #8
+ .endfunc
+
+ pixfunc put_ pixels8
+ pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8
+ pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
+ pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
+
+function ff_float_to_int16_neon, export=1
+ subs r2, r2, #8
+ vld1.64 {d0-d1}, [r1,:128]!
+ vcvt.s32.f32 q8, q0, #16
+ vld1.64 {d2-d3}, [r1,:128]!
+ vcvt.s32.f32 q9, q1, #16
+ beq 3f
+ bics ip, r2, #15
+ beq 2f
+1: subs ip, ip, #16
+ vshrn.s32 d4, q8, #16
+ vld1.64 {d0-d1}, [r1,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vshrn.s32 d5, q9, #16
+ vld1.64 {d2-d3}, [r1,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vshrn.s32 d6, q0, #16
+ vst1.64 {d4-d5}, [r0,:128]!
+ vshrn.s32 d7, q1, #16
+ vld1.64 {d16-d17},[r1,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r1,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.64 {d6-d7}, [r0,:128]!
+ bne 1b
+ ands r2, r2, #15
+ beq 3f
+2: vld1.64 {d0-d1}, [r1,:128]!
+ vshrn.s32 d4, q8, #16
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r1,:128]!
+ vshrn.s32 d5, q9, #16
+ vcvt.s32.f32 q1, q1, #16
+ vshrn.s32 d6, q0, #16
+ vst1.64 {d4-d5}, [r0,:128]!
+ vshrn.s32 d7, q1, #16
+ vst1.64 {d6-d7}, [r0,:128]!
+ bx lr
+3: vshrn.s32 d4, q8, #16
+ vshrn.s32 d5, q9, #16
+ vst1.64 {d4-d5}, [r0,:128]!
+ bx lr
+ .endfunc
+
+function ff_float_to_int16_interleave_neon, export=1
+ cmp r3, #2
+ ldrlt r1, [r1]
+ blt ff_float_to_int16_neon
+ bne 4f
+
+ ldr r3, [r1]
+ ldr r1, [r1, #4]
+
+ subs r2, r2, #8
+ vld1.64 {d0-d1}, [r3,:128]!
+ vcvt.s32.f32 q8, q0, #16
+ vld1.64 {d2-d3}, [r3,:128]!
+ vcvt.s32.f32 q9, q1, #16
+ vld1.64 {d20-d21},[r1,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r1,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ beq 3f
+ bics ip, r2, #15
+ beq 2f
+1: subs ip, ip, #16
+ vld1.64 {d0-d1}, [r3,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vsri.32 q10, q8, #16
+ vld1.64 {d2-d3}, [r3,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vld1.64 {d24-d25},[r1,:128]!
+ vcvt.s32.f32 q12, q12, #16
+ vld1.64 {d26-d27},[r1,:128]!
+ vsri.32 q11, q9, #16
+ vst1.64 {d20-d21},[r0,:128]!
+ vcvt.s32.f32 q13, q13, #16
+ vst1.64 {d22-d23},[r0,:128]!
+ vsri.32 q12, q0, #16
+ vld1.64 {d16-d17},[r3,:128]!
+ vsri.32 q13, q1, #16
+ vst1.64 {d24-d25},[r0,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r3,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vld1.64 {d20-d21},[r1,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r1,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ vst1.64 {d26-d27},[r0,:128]!
+ bne 1b
+ ands r2, r2, #15
+ beq 3f
+2: vsri.32 q10, q8, #16
+ vld1.64 {d0-d1}, [r3,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r3,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vld1.64 {d24-d25},[r1,:128]!
+ vcvt.s32.f32 q12, q12, #16
+ vsri.32 q11, q9, #16
+ vld1.64 {d26-d27},[r1,:128]!
+ vcvt.s32.f32 q13, q13, #16
+ vst1.64 {d20-d21},[r0,:128]!
+ vsri.32 q12, q0, #16
+ vst1.64 {d22-d23},[r0,:128]!
+ vsri.32 q13, q1, #16
+ vst1.64 {d24-d27},[r0,:128]!
+ bx lr
+3: vsri.32 q10, q8, #16
+ vsri.32 q11, q9, #16
+ vst1.64 {d20-d23},[r0,:128]!
+ bx lr
+
+4: push {r4-r8,lr}
+ cmp r3, #4
+ lsl ip, r3, #1
+ blt 4f
+
+ @ 4 channels
+5: ldmia r1!, {r4-r7}
+ mov lr, r2
+ mov r8, r0
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vld1.64 {d20-d21},[r6,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r7,:128]!
+ vcvt.s32.f32 q11, q11, #16
+6: subs lr, lr, #8
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vsri.32 q9, q8, #16
+ vld1.64 {d2-d3}, [r5,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vsri.32 q11, q10, #16
+ vld1.64 {d4-d5}, [r6,:128]!
+ vcvt.s32.f32 q2, q2, #16
+ vzip.32 d18, d22
+ vld1.64 {d6-d7}, [r7,:128]!
+ vcvt.s32.f32 q3, q3, #16
+ vzip.32 d19, d23
+ vst1.64 {d18}, [r8], ip
+ vsri.32 q1, q0, #16
+ vst1.64 {d22}, [r8], ip
+ vsri.32 q3, q2, #16
+ vst1.64 {d19}, [r8], ip
+ vzip.32 d2, d6
+ vst1.64 {d23}, [r8], ip
+ vzip.32 d3, d7
+ beq 7f
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vst1.64 {d2}, [r8], ip
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.64 {d6}, [r8], ip
+ vld1.64 {d20-d21},[r6,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vst1.64 {d3}, [r8], ip
+ vld1.64 {d22-d23},[r7,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ vst1.64 {d7}, [r8], ip
+ b 6b
+7: vst1.64 {d2}, [r8], ip
+ vst1.64 {d6}, [r8], ip
+ vst1.64 {d3}, [r8], ip
+ vst1.64 {d7}, [r8], ip
+ subs r3, r3, #4
+ popeq {r4-r8,pc}
+ cmp r3, #4
+ add r0, r0, #8
+ bge 5b
+
+ @ 2 channels
+4: cmp r3, #2
+ blt 4f
+ ldmia r1!, {r4-r5}
+ mov lr, r2
+ mov r8, r0
+ tst lr, #8
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vld1.64 {d20-d21},[r4,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r5,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ beq 6f
+ subs lr, lr, #8
+ beq 7f
+ vsri.32 d18, d16, #16
+ vsri.32 d19, d17, #16
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vst1.32 {d18[0]}, [r8], ip
+ vsri.32 d22, d20, #16
+ vst1.32 {d18[1]}, [r8], ip
+ vsri.32 d23, d21, #16
+ vst1.32 {d19[0]}, [r8], ip
+ vst1.32 {d19[1]}, [r8], ip
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.32 {d22[0]}, [r8], ip
+ vst1.32 {d22[1]}, [r8], ip
+ vld1.64 {d20-d21},[r4,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vst1.32 {d23[0]}, [r8], ip
+ vst1.32 {d23[1]}, [r8], ip
+ vld1.64 {d22-d23},[r5,:128]!
+ vcvt.s32.f32 q11, q11, #16
+6: subs lr, lr, #16
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vsri.32 d18, d16, #16
+ vld1.64 {d2-d3}, [r5,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vsri.32 d19, d17, #16
+ vld1.64 {d4-d5}, [r4,:128]!
+ vcvt.s32.f32 q2, q2, #16
+ vld1.64 {d6-d7}, [r5,:128]!
+ vcvt.s32.f32 q3, q3, #16
+ vst1.32 {d18[0]}, [r8], ip
+ vsri.32 d22, d20, #16
+ vst1.32 {d18[1]}, [r8], ip
+ vsri.32 d23, d21, #16
+ vst1.32 {d19[0]}, [r8], ip
+ vsri.32 d2, d0, #16
+ vst1.32 {d19[1]}, [r8], ip
+ vsri.32 d3, d1, #16
+ vst1.32 {d22[0]}, [r8], ip
+ vsri.32 d6, d4, #16
+ vst1.32 {d22[1]}, [r8], ip
+ vsri.32 d7, d5, #16
+ vst1.32 {d23[0]}, [r8], ip
+ vst1.32 {d23[1]}, [r8], ip
+ beq 6f
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vst1.32 {d2[0]}, [r8], ip
+ vst1.32 {d2[1]}, [r8], ip
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.32 {d3[0]}, [r8], ip
+ vst1.32 {d3[1]}, [r8], ip
+ vld1.64 {d20-d21},[r4,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vst1.32 {d6[0]}, [r8], ip
+ vst1.32 {d6[1]}, [r8], ip
+ vld1.64 {d22-d23},[r5,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ vst1.32 {d7[0]}, [r8], ip
+ vst1.32 {d7[1]}, [r8], ip
+ bgt 6b
+6: vst1.32 {d2[0]}, [r8], ip
+ vst1.32 {d2[1]}, [r8], ip
+ vst1.32 {d3[0]}, [r8], ip
+ vst1.32 {d3[1]}, [r8], ip
+ vst1.32 {d6[0]}, [r8], ip
+ vst1.32 {d6[1]}, [r8], ip
+ vst1.32 {d7[0]}, [r8], ip
+ vst1.32 {d7[1]}, [r8], ip
+ b 8f
+7: vsri.32 d18, d16, #16
+ vsri.32 d19, d17, #16
+ vst1.32 {d18[0]}, [r8], ip
+ vsri.32 d22, d20, #16
+ vst1.32 {d18[1]}, [r8], ip
+ vsri.32 d23, d21, #16
+ vst1.32 {d19[0]}, [r8], ip
+ vst1.32 {d19[1]}, [r8], ip
+ vst1.32 {d22[0]}, [r8], ip
+ vst1.32 {d22[1]}, [r8], ip
+ vst1.32 {d23[0]}, [r8], ip
+ vst1.32 {d23[1]}, [r8], ip
+8: subs r3, r3, #2
+ add r0, r0, #4
+ popeq {r4-r8,pc}
+
+ @ 1 channel
+4: ldr r4, [r1],#4
+ tst r2, #8
+ mov lr, r2
+ mov r5, r0
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r4,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ bne 8f
+6: subs lr, lr, #16
+ vld1.64 {d4-d5}, [r4,:128]!
+ vcvt.s32.f32 q2, q2, #16
+ vld1.64 {d6-d7}, [r4,:128]!
+ vcvt.s32.f32 q3, q3, #16
+ vst1.16 {d0[1]}, [r5,:16], ip
+ vst1.16 {d0[3]}, [r5,:16], ip
+ vst1.16 {d1[1]}, [r5,:16], ip
+ vst1.16 {d1[3]}, [r5,:16], ip
+ vst1.16 {d2[1]}, [r5,:16], ip
+ vst1.16 {d2[3]}, [r5,:16], ip
+ vst1.16 {d3[1]}, [r5,:16], ip
+ vst1.16 {d3[3]}, [r5,:16], ip
+ beq 7f
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r4,:128]!
+ vcvt.s32.f32 q1, q1, #16
+7: vst1.16 {d4[1]}, [r5,:16], ip
+ vst1.16 {d4[3]}, [r5,:16], ip
+ vst1.16 {d5[1]}, [r5,:16], ip
+ vst1.16 {d5[3]}, [r5,:16], ip
+ vst1.16 {d6[1]}, [r5,:16], ip
+ vst1.16 {d6[3]}, [r5,:16], ip
+ vst1.16 {d7[1]}, [r5,:16], ip
+ vst1.16 {d7[3]}, [r5,:16], ip
+ bgt 6b
+ pop {r4-r8,pc}
+8: subs lr, lr, #8
+ vst1.16 {d0[1]}, [r5,:16], ip
+ vst1.16 {d0[3]}, [r5,:16], ip
+ vst1.16 {d1[1]}, [r5,:16], ip
+ vst1.16 {d1[3]}, [r5,:16], ip
+ vst1.16 {d2[1]}, [r5,:16], ip
+ vst1.16 {d2[3]}, [r5,:16], ip
+ vst1.16 {d3[1]}, [r5,:16], ip
+ vst1.16 {d3[3]}, [r5,:16], ip
+ popeq {r4-r8,pc}
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r4,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ b 6b
+ .endfunc
diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S
new file mode 100644
index 0000000..04c8014
--- /dev/null
+++ b/libavcodec/arm/dsputil_vfp.S
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb at users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+ .fpu neon @ required for gas to accept UAL syntax
+/*
+ * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle
+ * throughput for almost all the instructions (except for double precision
+ * arithmetics), but rather high latency. Latency is 4 cycles for loads and 8 cycles
+ * for arithmetic operations. Scheduling code to avoid pipeline stalls is very
+ * important for performance. One more interesting feature is that VFP has
+ * independent load/store and arithmetics pipelines, so it is possible to make
+ * them work simultaneously and get more than 1 operation per cycle. Load/store
+ * pipeline can process 2 single precision floating point values per cycle and
+ * supports bulk loads and stores for large sets of registers. Arithmetic operations
+ * can be done on vectors, which allows to keep the arithmetics pipeline busy,
+ * while the processor may issue and execute other instructions. Detailed
+ * optimization manuals can be found at http://www.arm.com
+ */
+
+/**
+ * ARM VFP optimized implementation of 'vector_fmul_c' function.
+ * Assume that len is a positive number and is multiple of 8
+ */
+@ void ff_vector_fmul_vfp(float *dst, const float *src, int len)
+function ff_vector_fmul_vfp, export=1
+ vpush {d8-d15}
+ mov r3, r0
+ fmrx r12, fpscr
+ orr r12, r12, #(3 << 16) /* set vector size to 4 */
+ fmxr fpscr, r12
+
+ vldmia r3!, {s0-s3}
+ vldmia r1!, {s8-s11}
+ vldmia r3!, {s4-s7}
+ vldmia r1!, {s12-s15}
+ vmul.f32 s8, s0, s8
+1:
+ subs r2, r2, #16
+ vmul.f32 s12, s4, s12
+ vldmiage r3!, {s16-s19}
+ vldmiage r1!, {s24-s27}
+ vldmiage r3!, {s20-s23}
+ vldmiage r1!, {s28-s31}
+ vmulge.f32 s24, s16, s24
+ vstmia r0!, {s8-s11}
+ vstmia r0!, {s12-s15}
+ vmulge.f32 s28, s20, s28
+ vldmiagt r3!, {s0-s3}
+ vldmiagt r1!, {s8-s11}
+ vldmiagt r3!, {s4-s7}
+ vldmiagt r1!, {s12-s15}
+ vmulge.f32 s8, s0, s8
+ vstmiage r0!, {s24-s27}
+ vstmiage r0!, {s28-s31}
+ bgt 1b
+
+ bic r12, r12, #(7 << 16) /* set vector size back to 1 */
+ fmxr fpscr, r12
+ vpop {d8-d15}
+ bx lr
+ .endfunc
+
+/**
+ * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
+ * Assume that len is a positive number and is multiple of 8
+ */
+@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
+@ const float *src1, int len)
+function ff_vector_fmul_reverse_vfp, export=1
+ vpush {d8-d15}
+ add r2, r2, r3, lsl #2
+ vldmdb r2!, {s0-s3}
+ vldmia r1!, {s8-s11}
+ vldmdb r2!, {s4-s7}
+ vldmia r1!, {s12-s15}
+ vmul.f32 s8, s3, s8
+ vmul.f32 s9, s2, s9
+ vmul.f32 s10, s1, s10
+ vmul.f32 s11, s0, s11
+1:
+ subs r3, r3, #16
+ vldmdbge r2!, {s16-s19}
+ vmul.f32 s12, s7, s12
+ vldmiage r1!, {s24-s27}
+ vmul.f32 s13, s6, s13
+ vldmdbge r2!, {s20-s23}
+ vmul.f32 s14, s5, s14
+ vldmiage r1!, {s28-s31}
+ vmul.f32 s15, s4, s15
+ vmulge.f32 s24, s19, s24
+ vldmdbgt r2!, {s0-s3}
+ vmulge.f32 s25, s18, s25
+ vstmia r0!, {s8-s13}
+ vmulge.f32 s26, s17, s26
+ vldmiagt r1!, {s8-s11}
+ vmulge.f32 s27, s16, s27
+ vmulge.f32 s28, s23, s28
+ vldmdbgt r2!, {s4-s7}
+ vmulge.f32 s29, s22, s29
+ vstmia r0!, {s14-s15}
+ vmulge.f32 s30, s21, s30
+ vmulge.f32 s31, s20, s31
+ vmulge.f32 s8, s3, s8
+ vldmiagt r1!, {s12-s15}
+ vmulge.f32 s9, s2, s9
+ vmulge.f32 s10, s1, s10
+ vstmiage r0!, {s24-s27}
+ vmulge.f32 s11, s0, s11
+ vstmiage r0!, {s28-s31}
+ bgt 1b
+
+ vpop {d8-d15}
+ bx lr
+ .endfunc
+
+#ifdef HAVE_ARMV6
+/**
+ * ARM VFP optimized float to int16 conversion.
+ * Assume that len is a positive number and is multiple of 8, destination
+ * buffer is at least 4 bytes aligned (8 bytes alignment is better for
+ * performance), little endian byte sex
+ */
+@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
+function ff_float_to_int16_vfp, export=1
+ push {r4-r8,lr}
+ vpush {d8-d11}
+ vldmia r1!, {s16-s23}
+ vcvt.s32.f32 s0, s16
+ vcvt.s32.f32 s1, s17
+ vcvt.s32.f32 s2, s18
+ vcvt.s32.f32 s3, s19
+ vcvt.s32.f32 s4, s20
+ vcvt.s32.f32 s5, s21
+ vcvt.s32.f32 s6, s22
+ vcvt.s32.f32 s7, s23
+1:
+ subs r2, r2, #8
+ vmov r3, r4, s0, s1
+ vmov r5, r6, s2, s3
+ vmov r7, r8, s4, s5
+ vmov ip, lr, s6, s7
+ vldmiagt r1!, {s16-s23}
+ ssat r4, #16, r4
+ ssat r3, #16, r3
+ ssat r6, #16, r6
+ ssat r5, #16, r5
+ pkhbt r3, r3, r4, lsl #16
+ pkhbt r4, r5, r6, lsl #16
+ vcvtgt.s32.f32 s0, s16
+ vcvtgt.s32.f32 s1, s17
+ vcvtgt.s32.f32 s2, s18
+ vcvtgt.s32.f32 s3, s19
+ vcvtgt.s32.f32 s4, s20
+ vcvtgt.s32.f32 s5, s21
+ vcvtgt.s32.f32 s6, s22
+ vcvtgt.s32.f32 s7, s23
+ ssat r8, #16, r8
+ ssat r7, #16, r7
+ ssat lr, #16, lr
+ ssat ip, #16, ip
+ pkhbt r5, r7, r8, lsl #16
+ pkhbt r6, ip, lr, lsl #16
+ stmia r0!, {r3-r6}
+ bgt 1b
+
+ vpop {d8-d11}
+ pop {r4-r8,pc}
+ .endfunc
+#endif
diff --git a/libavcodec/arm/float_arm_vfp.c b/libavcodec/arm/float_arm_vfp.c
new file mode 100644
index 0000000..5598aa9
--- /dev/null
+++ b/libavcodec/arm/float_arm_vfp.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb at users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+
+void ff_vector_fmul_vfp(float *dst, const float *src, int len);
+void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
+ const float *src1, int len);
+void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
+
+void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx)
+{
+ c->vector_fmul = ff_vector_fmul_vfp;
+ c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
+#ifdef HAVE_ARMV6
+ c->float_to_int16 = ff_float_to_int16_vfp;
+#endif
+}
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
new file mode 100644
index 0000000..39a8daf
--- /dev/null
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -0,0 +1,1377 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+ .fpu neon
+
+ .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
+ vtrn.32 \r0, \r4
+ vtrn.32 \r1, \r5
+ vtrn.32 \r2, \r6
+ vtrn.32 \r3, \r7
+ vtrn.16 \r0, \r2
+ vtrn.16 \r1, \r3
+ vtrn.16 \r4, \r6
+ vtrn.16 \r5, \r7
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+ vtrn.8 \r4, \r5
+ vtrn.8 \r6, \r7
+ .endm
+
+ .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
+ vswp \r0, \r4
+ vswp \r1, \r5
+ vswp \r2, \r6
+ vswp \r3, \r7
+ .endm
+
+ .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+ vtrn.32 \r4, \r6
+ vtrn.32 \r5, \r7
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+ vtrn.16 \r4, \r5
+ vtrn.16 \r6, \r7
+ .endm
+
+/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+ .macro h264_chroma_mc8 avg=0
+ push {r4-r7, lr}
+ ldrd r4, [sp, #20]
+.if \avg
+ mov lr, r0
+.endif
+ pld [r1]
+ pld [r1, r2]
+
+ muls r7, r4, r5
+ rsb r6, r7, r5, lsl #3
+ rsb ip, r7, r4, lsl #3
+ sub r4, r7, r4, lsl #3
+ sub r4, r4, r5, lsl #3
+ add r4, r4, #64
+
+ beq 2f
+
+ add r5, r1, r2
+
+ vdup.8 d0, r4
+ lsl r4, r2, #1
+ vdup.8 d1, ip
+ vld1.64 {d4, d5}, [r1], r4
+ vdup.8 d2, r6
+ vld1.64 {d6, d7}, [r5], r4
+ vdup.8 d3, r7
+
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+
+1: pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+ vld1.64 {d4, d5}, [r1], r4
+ vmlal.u8 q8, d6, d2
+ vext.8 d5, d4, d5, #1
+ vmlal.u8 q8, d7, d3
+ vmull.u8 q9, d6, d0
+ subs r3, r3, #2
+ vmlal.u8 q9, d7, d1
+ vmlal.u8 q9, d4, d2
+ vmlal.u8 q9, d5, d3
+ vrshrn.u16 d16, q8, #6
+ vld1.64 {d6, d7}, [r5], r4
+ pld [r1]
+ vrshrn.u16 d17, q9, #6
+.if \avg
+ vld1.64 {d20}, [lr,:64], r2
+ vld1.64 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+.endif
+ vext.8 d7, d6, d7, #1
+ vst1.64 {d16}, [r0,:64], r2
+ vst1.64 {d17}, [r0,:64], r2
+ bgt 1b
+
+ pop {r4-r7, pc}
+
+2: tst r6, r6
+ add ip, ip, r6
+ vdup.8 d0, r4
+ vdup.8 d1, ip
+
+ beq 4f
+
+ add r5, r1, r2
+ lsl r4, r2, #1
+ vld1.64 {d4}, [r1], r4
+ vld1.64 {d6}, [r5], r4
+
+3: pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d1
+ vld1.64 {d4}, [r1], r4
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d1
+ vld1.64 {d6}, [r5], r4
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+.if \avg
+ vld1.64 {d20}, [lr,:64], r2
+ vld1.64 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+.endif
+ subs r3, r3, #2
+ pld [r1]
+ vst1.64 {d16}, [r0,:64], r2
+ vst1.64 {d17}, [r0,:64], r2
+ bgt 3b
+
+ pop {r4-r7, pc}
+
+4: vld1.64 {d4, d5}, [r1], r2
+ vld1.64 {d6, d7}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+
+5: pld [r1]
+ subs r3, r3, #2
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+ vld1.64 {d4, d5}, [r1], r2
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d7, d1
+ pld [r1]
+ vext.8 d5, d4, d5, #1
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+.if \avg
+ vld1.64 {d20}, [lr,:64], r2
+ vld1.64 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+.endif
+ vld1.64 {d6, d7}, [r1], r2
+ vext.8 d7, d6, d7, #1
+ vst1.64 {d16}, [r0,:64], r2
+ vst1.64 {d17}, [r0,:64], r2
+ bgt 5b
+
+ pop {r4-r7, pc}
+ .endm
+
+/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+ .macro h264_chroma_mc4 avg=0
+ push {r4-r7, lr}
+ ldrd r4, [sp, #20]
+.if \avg
+ mov lr, r0
+.endif
+ pld [r1]
+ pld [r1, r2]
+
+ muls r7, r4, r5
+ rsb r6, r7, r5, lsl #3
+ rsb ip, r7, r4, lsl #3
+ sub r4, r7, r4, lsl #3
+ sub r4, r4, r5, lsl #3
+ add r4, r4, #64
+
+ beq 2f
+
+ add r5, r1, r2
+
+ vdup.8 d0, r4
+ lsl r4, r2, #1
+ vdup.8 d1, ip
+ vld1.64 {d4}, [r1], r4
+ vdup.8 d2, r6
+ vld1.64 {d6}, [r5], r4
+ vdup.8 d3, r7
+
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d4, d5
+ vtrn.32 d6, d7
+
+ vtrn.32 d0, d1
+ vtrn.32 d2, d3
+
+1: pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d2
+ vld1.64 {d4}, [r1], r4
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d2
+ vld1.64 {d6}, [r5], r4
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ vrshrn.u16 d16, q8, #6
+ subs r3, r3, #2
+ pld [r1]
+.if \avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+.endif
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d6, d7
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 1b
+
+ pop {r4-r7, pc}
+
+2: tst r6, r6
+ add ip, ip, r6
+ vdup.8 d0, r4
+ vdup.8 d1, ip
+ vtrn.32 d0, d1
+
+ beq 4f
+
+ vext.32 d1, d0, d1, #1
+ add r5, r1, r2
+ lsl r4, r2, #1
+ vld1.32 {d4[0]}, [r1], r4
+ vld1.32 {d4[1]}, [r5], r4
+
+3: pld [r5]
+ vmull.u8 q8, d4, d0
+ vld1.32 {d4[0]}, [r1], r4
+ vmull.u8 q9, d4, d1
+ vld1.32 {d4[1]}, [r5], r4
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ vrshrn.u16 d16, q8, #6
+.if \avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+.endif
+ subs r3, r3, #2
+ pld [r1]
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 3b
+
+ pop {r4-r7, pc}
+
+4: vld1.64 {d4}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d4, d5
+ vtrn.32 d6, d7
+
+5: vmull.u8 q8, d4, d0
+ vmull.u8 q9, d6, d0
+ subs r3, r3, #2
+ vld1.64 {d4}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ pld [r1]
+ vrshrn.u16 d16, q8, #6
+.if \avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+.endif
+ vld1.64 {d6}, [r1], r2
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d6, d7
+ pld [r1]
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 5b
+
+ pop {r4-r7, pc}
+ .endm
+
+ .text
+ .align
+
+function ff_put_h264_chroma_mc8_neon, export=1
+ h264_chroma_mc8
+ .endfunc
+
+function ff_avg_h264_chroma_mc8_neon, export=1
+ h264_chroma_mc8 avg=1
+ .endfunc
+
+function ff_put_h264_chroma_mc4_neon, export=1
+ h264_chroma_mc4
+ .endfunc
+
+function ff_avg_h264_chroma_mc4_neon, export=1
+ h264_chroma_mc4 avg=1
+ .endfunc
+
+ /* H.264 loop filter */
+
+ .macro h264_loop_filter_start
+ ldr ip, [sp]
+ tst r2, r2
+ ldr ip, [ip]
+ tstne r3, r3
+ vmov.32 d24[0], ip
+ and ip, ip, ip, lsl #16
+ bxeq lr
+ ands ip, ip, ip, lsl #8
+ bxlt lr
+ .endm
+
+ .macro align_push_regs
+ and ip, sp, #15
+ add ip, ip, #32
+ sub sp, sp, ip
+ vst1.64 {d12-d15}, [sp,:128]
+ sub sp, sp, #32
+ vst1.64 {d8-d11}, [sp,:128]
+ .endm
+
+ .macro align_pop_regs
+ vld1.64 {d8-d11}, [sp,:128]!
+ vld1.64 {d12-d15}, [sp,:128], ip
+ .endm
+
+ .macro h264_loop_filter_luma
+ vdup.8 q11, r2 @ alpha
+ vmovl.u8 q12, d24
+ vabd.u8 q6, q8, q0 @ abs(p0 - q0)
+ vmovl.u16 q12, d24
+ vabd.u8 q14, q9, q8 @ abs(p1 - p0)
+ vsli.16 q12, q12, #8
+ vabd.u8 q15, q1, q0 @ abs(q1 - q0)
+ vsli.32 q12, q12, #16
+ vclt.u8 q6, q6, q11 @ < alpha
+ vdup.8 q11, r3 @ beta
+ vclt.s8 q7, q12, #0
+ vclt.u8 q14, q14, q11 @ < beta
+ vclt.u8 q15, q15, q11 @ < beta
+ vbic q6, q6, q7
+ vabd.u8 q4, q10, q8 @ abs(p2 - p0)
+ vand q6, q6, q14
+ vabd.u8 q5, q2, q0 @ abs(q2 - q0)
+ vclt.u8 q4, q4, q11 @ < beta
+ vand q6, q6, q15
+ vclt.u8 q5, q5, q11 @ < beta
+ vand q4, q4, q6
+ vand q5, q5, q6
+ vand q12, q12, q6
+ vrhadd.u8 q14, q8, q0
+ vsub.i8 q6, q12, q4
+ vqadd.u8 q7, q9, q12
+ vhadd.u8 q10, q10, q14
+ vsub.i8 q6, q6, q5
+ vhadd.u8 q14, q2, q14
+ vmin.u8 q7, q7, q10
+ vqsub.u8 q11, q9, q12
+ vqadd.u8 q2, q1, q12
+ vmax.u8 q7, q7, q11
+ vqsub.u8 q11, q1, q12
+ vmin.u8 q14, q2, q14
+ vmovl.u8 q2, d0
+ vmax.u8 q14, q14, q11
+ vmovl.u8 q10, d1
+ vsubw.u8 q2, q2, d16
+ vsubw.u8 q10, q10, d17
+ vshl.i16 q2, q2, #2
+ vshl.i16 q10, q10, #2
+ vaddw.u8 q2, q2, d18
+ vaddw.u8 q10, q10, d19
+ vsubw.u8 q2, q2, d2
+ vsubw.u8 q10, q10, d3
+ vrshrn.i16 d4, q2, #3
+ vrshrn.i16 d5, q10, #3
+ vbsl q4, q7, q9
+ vbsl q5, q14, q1
+ vneg.s8 q7, q6
+ vmovl.u8 q14, d16
+ vmin.s8 q2, q2, q6
+ vmovl.u8 q6, d17
+ vmax.s8 q2, q2, q7
+ vmovl.u8 q11, d0
+ vmovl.u8 q12, d1
+ vaddw.s8 q14, q14, d4
+ vaddw.s8 q6, q6, d5
+ vsubw.s8 q11, q11, d4
+ vsubw.s8 q12, q12, d5
+ vqmovun.s16 d16, q14
+ vqmovun.s16 d17, q6
+ vqmovun.s16 d0, q11
+ vqmovun.s16 d1, q12
+ .endm
+
+function ff_h264_v_loop_filter_luma_neon, export=1
+ h264_loop_filter_start
+
+ vld1.64 {d0, d1}, [r0,:128], r1
+ vld1.64 {d2, d3}, [r0,:128], r1
+ vld1.64 {d4, d5}, [r0,:128], r1
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+ vld1.64 {d20,d21}, [r0,:128], r1
+ vld1.64 {d18,d19}, [r0,:128], r1
+ vld1.64 {d16,d17}, [r0,:128], r1
+
+ align_push_regs
+
+ h264_loop_filter_luma
+
+ sub r0, r0, r1, lsl #1
+ vst1.64 {d8, d9}, [r0,:128], r1
+ vst1.64 {d16,d17}, [r0,:128], r1
+ vst1.64 {d0, d1}, [r0,:128], r1
+ vst1.64 {d10,d11}, [r0,:128]
+
+ align_pop_regs
+ bx lr
+ .endfunc
+
+function ff_h264_h_loop_filter_luma_neon, export=1
+ h264_loop_filter_start
+
+ sub r0, r0, #4
+ vld1.64 {d6}, [r0], r1
+ vld1.64 {d20}, [r0], r1
+ vld1.64 {d18}, [r0], r1
+ vld1.64 {d16}, [r0], r1
+ vld1.64 {d0}, [r0], r1
+ vld1.64 {d2}, [r0], r1
+ vld1.64 {d4}, [r0], r1
+ vld1.64 {d26}, [r0], r1
+ vld1.64 {d7}, [r0], r1
+ vld1.64 {d21}, [r0], r1
+ vld1.64 {d19}, [r0], r1
+ vld1.64 {d17}, [r0], r1
+ vld1.64 {d1}, [r0], r1
+ vld1.64 {d3}, [r0], r1
+ vld1.64 {d5}, [r0], r1
+ vld1.64 {d27}, [r0], r1
+
+ transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
+
+ align_push_regs
+ sub sp, sp, #16
+ vst1.64 {d4, d5}, [sp,:128]
+ sub sp, sp, #16
+ vst1.64 {d20,d21}, [sp,:128]
+
+ h264_loop_filter_luma
+
+ vld1.64 {d20,d21}, [sp,:128]!
+ vld1.64 {d4, d5}, [sp,:128]!
+
+ transpose_8x8 q3, q10, q4, q8, q0, q5, q2, q13
+
+ sub r0, r0, r1, lsl #4
+ vst1.64 {d6}, [r0], r1
+ vst1.64 {d20}, [r0], r1
+ vst1.64 {d8}, [r0], r1
+ vst1.64 {d16}, [r0], r1
+ vst1.64 {d0}, [r0], r1
+ vst1.64 {d10}, [r0], r1
+ vst1.64 {d4}, [r0], r1
+ vst1.64 {d26}, [r0], r1
+ vst1.64 {d7}, [r0], r1
+ vst1.64 {d21}, [r0], r1
+ vst1.64 {d9}, [r0], r1
+ vst1.64 {d17}, [r0], r1
+ vst1.64 {d1}, [r0], r1
+ vst1.64 {d11}, [r0], r1
+ vst1.64 {d5}, [r0], r1
+ vst1.64 {d27}, [r0], r1
+
+ align_pop_regs
+ bx lr
+ .endfunc
+
+ .macro h264_loop_filter_chroma
+ vdup.8 d22, r2 @ alpha
+ vmovl.u8 q12, d24
+ vabd.u8 d26, d16, d0 @ abs(p0 - q0)
+ vmovl.u8 q2, d0
+ vabd.u8 d28, d18, d16 @ abs(p1 - p0)
+ vsubw.u8 q2, q2, d16
+ vsli.16 d24, d24, #8
+ vshl.i16 q2, q2, #2
+ vabd.u8 d30, d2, d0 @ abs(q1 - q0)
+ vaddw.u8 q2, q2, d18
+ vclt.u8 d26, d26, d22 @ < alpha
+ vsubw.u8 q2, q2, d2
+ vdup.8 d22, r3 @ beta
+ vclt.s8 d25, d24, #0
+ vrshrn.i16 d4, q2, #3
+ vclt.u8 d28, d28, d22 @ < beta
+ vbic d26, d26, d25
+ vclt.u8 d30, d30, d22 @ < beta
+ vand d26, d26, d28
+ vneg.s8 d25, d24
+ vand d26, d26, d30
+ vmin.s8 d4, d4, d24
+ vmovl.u8 q14, d16
+ vand d4, d4, d26
+ vmax.s8 d4, d4, d25
+ vmovl.u8 q11, d0
+ vaddw.s8 q14, q14, d4
+ vsubw.s8 q11, q11, d4
+ vqmovun.s16 d16, q14
+ vqmovun.s16 d0, q11
+ .endm
+
+function ff_h264_v_loop_filter_chroma_neon, export=1
+ h264_loop_filter_start
+
+ sub r0, r0, r1, lsl #1
+ vld1.64 {d18}, [r0,:64], r1
+ vld1.64 {d16}, [r0,:64], r1
+ vld1.64 {d0}, [r0,:64], r1
+ vld1.64 {d2}, [r0,:64]
+
+ h264_loop_filter_chroma
+
+ sub r0, r0, r1, lsl #1
+ vst1.64 {d16}, [r0,:64], r1
+ vst1.64 {d0}, [r0,:64], r1
+
+ bx lr
+ .endfunc
+
+function ff_h264_h_loop_filter_chroma_neon, export=1
+ h264_loop_filter_start
+
+ sub r0, r0, #2
+ vld1.32 {d18[0]}, [r0], r1
+ vld1.32 {d16[0]}, [r0], r1
+ vld1.32 {d0[0]}, [r0], r1
+ vld1.32 {d2[0]}, [r0], r1
+ vld1.32 {d18[1]}, [r0], r1
+ vld1.32 {d16[1]}, [r0], r1
+ vld1.32 {d0[1]}, [r0], r1
+ vld1.32 {d2[1]}, [r0], r1
+
+ vtrn.16 d18, d0
+ vtrn.16 d16, d2
+ vtrn.8 d18, d16
+ vtrn.8 d0, d2
+
+ h264_loop_filter_chroma
+
+ vtrn.16 d18, d0
+ vtrn.16 d16, d2
+ vtrn.8 d18, d16
+ vtrn.8 d0, d2
+
+ sub r0, r0, r1, lsl #3
+ vst1.32 {d18[0]}, [r0], r1
+ vst1.32 {d16[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d2[0]}, [r0], r1
+ vst1.32 {d18[1]}, [r0], r1
+ vst1.32 {d16[1]}, [r0], r1
+ vst1.32 {d0[1]}, [r0], r1
+ vst1.32 {d2[1]}, [r0], r1
+
+ bx lr
+ .endfunc
+
+ /* H.264 qpel MC */
+
+ .macro lowpass_const r
+ movw \r, #5
+ movt \r, #20
+ vmov.32 d6[0], \r
+ .endm
+
+ .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
+.if \narrow
+ t0 .req q0
+ t1 .req q8
+.else
+ t0 .req \d0
+ t1 .req \d1
+.endif
+ vext.8 d2, \r0, \r1, #2
+ vext.8 d3, \r0, \r1, #3
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, \r0, \r1, #1
+ vext.8 d5, \r0, \r1, #4
+ vaddl.u8 q2, d4, d5
+ vext.8 d30, \r0, \r1, #5
+ vaddl.u8 t0, \r0, d30
+ vext.8 d18, \r2, \r3, #2
+ vmla.i16 t0, q1, d6[1]
+ vext.8 d19, \r2, \r3, #3
+ vaddl.u8 q9, d18, d19
+ vext.8 d20, \r2, \r3, #1
+ vmls.i16 t0, q2, d6[0]
+ vext.8 d21, \r2, \r3, #4
+ vaddl.u8 q10, d20, d21
+ vext.8 d31, \r2, \r3, #5
+ vaddl.u8 t1, \r2, d31
+ vmla.i16 t1, q9, d6[1]
+ vmls.i16 t1, q10, d6[0]
+.if \narrow
+ vqrshrun.s16 \d0, t0, #5
+ vqrshrun.s16 \d1, t1, #5
+.endif
+ .unreq t0
+ .unreq t1
+ .endm
+
+ .macro lowpass_8_1 r0, r1, d0, narrow=1
+.if \narrow
+ t0 .req q0
+.else
+ t0 .req \d0
+.endif
+ vext.8 d2, \r0, \r1, #2
+ vext.8 d3, \r0, \r1, #3
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, \r0, \r1, #1
+ vext.8 d5, \r0, \r1, #4
+ vaddl.u8 q2, d4, d5
+ vext.8 d30, \r0, \r1, #5
+ vaddl.u8 t0, \r0, d30
+ vmla.i16 t0, q1, d6[1]
+ vmls.i16 t0, q2, d6[0]
+.if \narrow
+ vqrshrun.s16 \d0, t0, #5
+.endif
+ .unreq t0
+ .endm
+
+ .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
+ vext.16 q1, \r0, \r1, #2
+ vext.16 q0, \r0, \r1, #3
+ vaddl.s16 q9, d2, d0
+ vext.16 q2, \r0, \r1, #1
+ vaddl.s16 q1, d3, d1
+ vext.16 q3, \r0, \r1, #4
+ vaddl.s16 q10, d4, d6
+ vext.16 \r1, \r0, \r1, #5
+ vaddl.s16 q2, d5, d7
+ vaddl.s16 q0, \h0, \h1
+ vaddl.s16 q8, \l0, \l1
+
+ vshl.i32 q3, q9, #4
+ vshl.i32 q9, q9, #2
+ vshl.i32 q15, q10, #2
+ vadd.i32 q9, q9, q3
+ vadd.i32 q10, q10, q15
+
+ vshl.i32 q3, q1, #4
+ vshl.i32 q1, q1, #2
+ vshl.i32 q15, q2, #2
+ vadd.i32 q1, q1, q3
+ vadd.i32 q2, q2, q15
+
+ vadd.i32 q9, q9, q8
+ vsub.i32 q9, q9, q10
+
+ vadd.i32 q1, q1, q0
+ vsub.i32 q1, q1, q2
+
+ vrshrn.s32 d18, q9, #10
+ vrshrn.s32 d19, q1, #10
+
+ vqmovun.s16 \d, q9
+ .endm
+
+function put_h264_qpel16_h_lowpass_neon_packed
+ mov r4, lr
+ mov ip, #16
+ mov r3, #8
+ bl put_h264_qpel8_h_lowpass_neon
+ sub r1, r1, r2, lsl #4
+ add r1, r1, #8
+ mov ip, #16
+ mov lr, r4
+ b put_h264_qpel8_h_lowpass_neon
+ .endfunc
+
+function put_h264_qpel16_h_lowpass_neon
+ push {lr}
+ mov ip, #16
+ bl put_h264_qpel8_h_lowpass_neon
+ sub r0, r0, r3, lsl #4
+ sub r1, r1, r2, lsl #4
+ add r0, r0, #8
+ add r1, r1, #8
+ mov ip, #16
+ pop {lr}
+ .endfunc
+
+function put_h264_qpel8_h_lowpass_neon
+1: vld1.64 {d0, d1}, [r1], r2
+ vld1.64 {d16,d17}, [r1], r2
+ subs ip, ip, #2
+ lowpass_8 d0, d1, d16, d17, d0, d16
+ vst1.64 {d0}, [r0,:64], r3
+ vst1.64 {d16}, [r0,:64], r3
+ bne 1b
+ bx lr
+ .endfunc
+
+function put_h264_qpel16_h_lowpass_l2_neon
+ push {lr}
+ mov ip, #16
+ bl put_h264_qpel8_h_lowpass_l2_neon
+ sub r0, r0, r2, lsl #4
+ sub r1, r1, r2, lsl #4
+ sub r3, r3, r2, lsl #4
+ add r0, r0, #8
+ add r1, r1, #8
+ add r3, r3, #8
+ mov ip, #16
+ pop {lr}
+ .endfunc
+
+function put_h264_qpel8_h_lowpass_l2_neon
+1: vld1.64 {d0, d1}, [r1], r2
+ vld1.64 {d16,d17}, [r1], r2
+ vld1.64 {d28}, [r3], r2
+ vld1.64 {d29}, [r3], r2
+ subs ip, ip, #2
+ lowpass_8 d0, d1, d16, d17, d0, d1
+ vrhadd.u8 q0, q0, q14
+ vst1.64 {d0}, [r0,:64], r2
+ vst1.64 {d1}, [r0,:64], r2
+ bne 1b
+ bx lr
+ .endfunc
+
+function put_h264_qpel16_v_lowpass_neon_packed
+ mov r4, lr
+ mov r2, #8
+ bl put_h264_qpel8_v_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ bl put_h264_qpel8_v_lowpass_neon
+ sub r1, r1, r3, lsl #4
+ sub r1, r1, r3, lsl #2
+ add r1, r1, #8
+ bl put_h264_qpel8_v_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ mov lr, r4
+ b put_h264_qpel8_v_lowpass_neon
+ .endfunc
+
+function put_h264_qpel16_v_lowpass_neon
+ mov r4, lr
+ bl put_h264_qpel8_v_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ bl put_h264_qpel8_v_lowpass_neon
+ sub r0, r0, r2, lsl #4
+ add r0, r0, #8
+ sub r1, r1, r3, lsl #4
+ sub r1, r1, r3, lsl #2
+ add r1, r1, #8
+ bl put_h264_qpel8_v_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ mov lr, r4
+ .endfunc
+
+function put_h264_qpel8_v_lowpass_neon
+ vld1.64 {d8}, [r1], r3
+ vld1.64 {d10}, [r1], r3
+ vld1.64 {d12}, [r1], r3
+ vld1.64 {d14}, [r1], r3
+ vld1.64 {d22}, [r1], r3
+ vld1.64 {d24}, [r1], r3
+ vld1.64 {d26}, [r1], r3
+ vld1.64 {d28}, [r1], r3
+ vld1.64 {d9}, [r1], r3
+ vld1.64 {d11}, [r1], r3
+ vld1.64 {d13}, [r1], r3
+ vld1.64 {d15}, [r1], r3
+ vld1.64 {d23}, [r1]
+
+ transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
+ lowpass_8 d8, d9, d10, d11, d8, d10
+ lowpass_8 d12, d13, d14, d15, d12, d14
+ lowpass_8 d22, d23, d24, d25, d22, d24
+ lowpass_8 d26, d27, d28, d29, d26, d28
+ transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
+
+ vst1.64 {d8}, [r0,:64], r2
+ vst1.64 {d10}, [r0,:64], r2
+ vst1.64 {d12}, [r0,:64], r2
+ vst1.64 {d14}, [r0,:64], r2
+ vst1.64 {d22}, [r0,:64], r2
+ vst1.64 {d24}, [r0,:64], r2
+ vst1.64 {d26}, [r0,:64], r2
+ vst1.64 {d28}, [r0,:64], r2
+
+ bx lr
+ .endfunc
+
+function put_h264_qpel16_v_lowpass_l2_neon
+ mov r4, lr
+ bl put_h264_qpel8_v_lowpass_l2_neon
+ sub r1, r1, r3, lsl #2
+ bl put_h264_qpel8_v_lowpass_l2_neon
+ sub r0, r0, r3, lsl #4
+ sub ip, ip, r2, lsl #4
+ add r0, r0, #8
+ add ip, ip, #8
+ sub r1, r1, r3, lsl #4
+ sub r1, r1, r3, lsl #2
+ add r1, r1, #8
+ bl put_h264_qpel8_v_lowpass_l2_neon
+ sub r1, r1, r3, lsl #2
+ mov lr, r4
+ .endfunc
+
+function put_h264_qpel8_v_lowpass_l2_neon
+ vld1.64 {d8}, [r1], r3
+ vld1.64 {d10}, [r1], r3
+ vld1.64 {d12}, [r1], r3
+ vld1.64 {d14}, [r1], r3
+ vld1.64 {d22}, [r1], r3
+ vld1.64 {d24}, [r1], r3
+ vld1.64 {d26}, [r1], r3
+ vld1.64 {d28}, [r1], r3
+ vld1.64 {d9}, [r1], r3
+ vld1.64 {d11}, [r1], r3
+ vld1.64 {d13}, [r1], r3
+ vld1.64 {d15}, [r1], r3
+ vld1.64 {d23}, [r1]
+
+ transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
+ lowpass_8 d8, d9, d10, d11, d8, d9
+ lowpass_8 d12, d13, d14, d15, d12, d13
+ lowpass_8 d22, d23, d24, d25, d22, d23
+ lowpass_8 d26, d27, d28, d29, d26, d27
+ transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
+
+ vld1.64 {d0}, [ip], r2
+ vld1.64 {d1}, [ip], r2
+ vld1.64 {d2}, [ip], r2
+ vld1.64 {d3}, [ip], r2
+ vld1.64 {d4}, [ip], r2
+ vrhadd.u8 q0, q0, q4
+ vld1.64 {d5}, [ip], r2
+ vrhadd.u8 q1, q1, q6
+ vld1.64 {d10}, [ip], r2
+ vrhadd.u8 q2, q2, q11
+ vld1.64 {d11}, [ip], r2
+
+ vst1.64 {d0}, [r0,:64], r3
+ vst1.64 {d1}, [r0,:64], r3
+ vrhadd.u8 q5, q5, q13
+ vst1.64 {d2}, [r0,:64], r3
+ vst1.64 {d3}, [r0,:64], r3
+ vst1.64 {d4}, [r0,:64], r3
+ vst1.64 {d5}, [r0,:64], r3
+ vst1.64 {d10}, [r0,:64], r3
+ vst1.64 {d11}, [r0,:64], r3
+
+ bx lr
+ .endfunc
+
+function put_h264_qpel8_hv_lowpass_neon_top
+ lowpass_const ip
+ mov ip, #12
+1: vld1.64 {d0, d1}, [r1], r3
+ vld1.64 {d16,d17}, [r1], r3
+ subs ip, ip, #2
+ lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
+ vst1.64 {d22-d25}, [r4,:128]!
+ bne 1b
+
+ vld1.64 {d0, d1}, [r1]
+ lowpass_8_1 d0, d1, q12, narrow=0
+
+ mov ip, #-16
+ add r4, r4, ip
+ vld1.64 {d30,d31}, [r4,:128], ip
+ vld1.64 {d20,d21}, [r4,:128], ip
+ vld1.64 {d18,d19}, [r4,:128], ip
+ vld1.64 {d16,d17}, [r4,:128], ip
+ vld1.64 {d14,d15}, [r4,:128], ip
+ vld1.64 {d12,d13}, [r4,:128], ip
+ vld1.64 {d10,d11}, [r4,:128], ip
+ vld1.64 {d8, d9}, [r4,:128], ip
+ vld1.64 {d6, d7}, [r4,:128], ip
+ vld1.64 {d4, d5}, [r4,:128], ip
+ vld1.64 {d2, d3}, [r4,:128], ip
+ vld1.64 {d0, d1}, [r4,:128]
+
+ swap4 d1, d3, d5, d7, d8, d10, d12, d14
+ transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
+
+ swap4 d17, d19, d21, d31, d24, d26, d28, d22
+ transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
+
+ vst1.64 {d30,d31}, [r4,:128]!
+ vst1.64 {d6, d7}, [r4,:128]!
+ vst1.64 {d20,d21}, [r4,:128]!
+ vst1.64 {d4, d5}, [r4,:128]!
+ vst1.64 {d18,d19}, [r4,:128]!
+ vst1.64 {d2, d3}, [r4,:128]!
+ vst1.64 {d16,d17}, [r4,:128]!
+ vst1.64 {d0, d1}, [r4,:128]
+
+ lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
+ lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
+ lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
+ lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
+
+ vld1.64 {d16,d17}, [r4,:128], ip
+ vld1.64 {d30,d31}, [r4,:128], ip
+ lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
+ vld1.64 {d16,d17}, [r4,:128], ip
+ vld1.64 {d30,d31}, [r4,:128], ip
+ lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
+ vld1.64 {d16,d17}, [r4,:128], ip
+ vld1.64 {d30,d31}, [r4,:128], ip
+ lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
+ vld1.64 {d16,d17}, [r4,:128], ip
+ vld1.64 {d30,d31}, [r4,:128]
+ lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
+
+ transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
+
+ bx lr
+ .endfunc
+
+function put_h264_qpel8_hv_lowpass_neon
+ mov r10, lr
+ bl put_h264_qpel8_hv_lowpass_neon_top
+ vst1.64 {d12}, [r0,:64], r2
+ vst1.64 {d13}, [r0,:64], r2
+ vst1.64 {d14}, [r0,:64], r2
+ vst1.64 {d15}, [r0,:64], r2
+ vst1.64 {d8}, [r0,:64], r2
+ vst1.64 {d9}, [r0,:64], r2
+ vst1.64 {d10}, [r0,:64], r2
+ vst1.64 {d11}, [r0,:64], r2
+
+ mov lr, r10
+ bx lr
+ .endfunc
+
+function put_h264_qpel8_hv_lowpass_l2_neon
+ mov r10, lr
+ bl put_h264_qpel8_hv_lowpass_neon_top
+
+ vld1.64 {d0, d1}, [r2,:128]!
+ vld1.64 {d2, d3}, [r2,:128]!
+ vrhadd.u8 q0, q0, q6
+ vld1.64 {d4, d5}, [r2,:128]!
+ vrhadd.u8 q1, q1, q7
+ vld1.64 {d6, d7}, [r2,:128]!
+ vrhadd.u8 q2, q2, q4
+
+ vst1.64 {d0}, [r0,:64], r3
+ vrhadd.u8 q3, q3, q5
+ vst1.64 {d1}, [r0,:64], r3
+ vst1.64 {d2}, [r0,:64], r3
+ vst1.64 {d3}, [r0,:64], r3
+ vst1.64 {d4}, [r0,:64], r3
+ vst1.64 {d5}, [r0,:64], r3
+ vst1.64 {d6}, [r0,:64], r3
+ vst1.64 {d7}, [r0,:64], r3
+
+ mov lr, r10
+ bx lr
+ .endfunc
+
+function put_h264_qpel16_hv_lowpass_neon
+ mov r9, lr
+ bl put_h264_qpel8_hv_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ bl put_h264_qpel8_hv_lowpass_neon
+ sub r1, r1, r3, lsl #4
+ sub r1, r1, r3, lsl #2
+ add r1, r1, #8
+ sub r0, r0, r2, lsl #4
+ add r0, r0, #8
+ bl put_h264_qpel8_hv_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ mov lr, r9
+ b put_h264_qpel8_hv_lowpass_neon
+ .endfunc
+
+function put_h264_qpel16_hv_lowpass_l2_neon
+ mov r9, lr
+ sub r2, r4, #256
+ bl put_h264_qpel8_hv_lowpass_l2_neon
+ sub r1, r1, r3, lsl #2
+ bl put_h264_qpel8_hv_lowpass_l2_neon
+ sub r1, r1, r3, lsl #4
+ sub r1, r1, r3, lsl #2
+ add r1, r1, #8
+ sub r0, r0, r3, lsl #4
+ add r0, r0, #8
+ bl put_h264_qpel8_hv_lowpass_l2_neon
+ sub r1, r1, r3, lsl #2
+ mov lr, r9
+ b put_h264_qpel8_hv_lowpass_l2_neon
+ .endfunc
+
+function ff_put_h264_qpel8_mc10_neon, export=1
+ lowpass_const r3
+ mov r3, r1
+ sub r1, r1, #2
+ mov ip, #8
+ b put_h264_qpel8_h_lowpass_l2_neon
+ .endfunc
+
+function ff_put_h264_qpel8_mc20_neon, export=1
+ lowpass_const r3
+ sub r1, r1, #2
+ mov r3, r2
+ mov ip, #8
+ b put_h264_qpel8_h_lowpass_neon
+ .endfunc
+
+function ff_put_h264_qpel8_mc30_neon, export=1
+ lowpass_const r3
+ add r3, r1, #1
+ sub r1, r1, #2
+ mov ip, #8
+ b put_h264_qpel8_h_lowpass_l2_neon
+ .endfunc
+
+function ff_put_h264_qpel8_mc01_neon, export=1
+ push {lr}
+ mov ip, r1
+put_h264_qpel8_mc01:
+ lowpass_const r3
+ mov r3, r2
+ sub r1, r1, r2, lsl #1
+ vpush {d8-d15}
+ bl put_h264_qpel8_v_lowpass_l2_neon
+ vpop {d8-d15}
+ pop {pc}
+ .endfunc
+
+function ff_put_h264_qpel8_mc11_neon, export=1
+ push {r0, r1, r2, lr}
+put_h264_qpel8_mc11:
+ lowpass_const r3
+ sub sp, sp, #64
+ mov r0, sp
+ sub r1, r1, #2
+ mov r3, #8
+ mov ip, #8
+ vpush {d8-d15}
+ bl put_h264_qpel8_h_lowpass_neon
+ ldrd r0, [sp, #128]
+ mov r3, r2
+ add ip, sp, #64
+ sub r1, r1, r2, lsl #1
+ mov r2, #8
+ bl put_h264_qpel8_v_lowpass_l2_neon
+ vpop {d8-d15}
+ add sp, sp, #76
+ pop {pc}
+ .endfunc
+
+function ff_put_h264_qpel8_mc21_neon, export=1
+ push {r0, r1, r4, r10, r11, lr}
+put_h264_qpel8_mc21:
+ lowpass_const r3
+ mov r11, sp
+ bic sp, sp, #15
+ sub sp, sp, #(8*8+16*12)
+ sub r1, r1, #2
+ mov r3, #8
+ mov r0, sp
+ mov ip, #8
+ vpush {d8-d15}
+ bl put_h264_qpel8_h_lowpass_neon
+ mov r4, r0
+ ldrd r0, [r11]
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, r2
+ sub r2, r4, #64
+ bl put_h264_qpel8_hv_lowpass_l2_neon
+ vpop {d8-d15}
+ add sp, r11, #8
+ pop {r4, r10, r11, pc}
+ .endfunc
+
+function ff_put_h264_qpel8_mc31_neon, export=1
+ add r1, r1, #1
+ push {r0, r1, r2, lr}
+ sub r1, r1, #1
+ b put_h264_qpel8_mc11
+ .endfunc
+
+function ff_put_h264_qpel8_mc02_neon, export=1
+ push {lr}
+ lowpass_const r3
+ sub r1, r1, r2, lsl #1
+ mov r3, r2
+ vpush {d8-d15}
+ bl put_h264_qpel8_v_lowpass_neon
+ vpop {d8-d15}
+ pop {pc}
+ .endfunc
+
+function ff_put_h264_qpel8_mc12_neon, export=1
+ push {r0, r1, r4, r10, r11, lr}
+put_h264_qpel8_mc12:
+ lowpass_const r3
+ mov r11, sp
+ bic sp, sp, #15
+ sub sp, sp, #(8*8+16*12)
+ sub r1, r1, r2, lsl #1
+ mov r3, r2
+ mov r2, #8
+ mov r0, sp
+ vpush {d8-d15}
+ bl put_h264_qpel8_v_lowpass_neon
+ mov r4, r0
+ ldrd r0, [r11]
+ sub r1, r1, r3, lsl #1
+ sub r1, r1, #2
+ sub r2, r4, #64
+ bl put_h264_qpel8_hv_lowpass_l2_neon
+ vpop {d8-d15}
+ add sp, r11, #8
+ pop {r4, r10, r11, pc}
+ .endfunc
+
+function ff_put_h264_qpel8_mc22_neon, export=1
+ push {r4, r10, r11, lr}
+ mov r11, sp
+ bic sp, sp, #15
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, r2
+ sub sp, sp, #(16*12)
+ mov r4, sp
+ vpush {d8-d15}
+ bl put_h264_qpel8_hv_lowpass_neon
+ vpop {d8-d15}
+ mov sp, r11
+ pop {r4, r10, r11, pc}
+ .endfunc
+
+function ff_put_h264_qpel8_mc32_neon, export=1
+ push {r0, r1, r4, r10, r11, lr}
+ add r1, r1, #1
+ b put_h264_qpel8_mc12
+ .endfunc
+
+function ff_put_h264_qpel8_mc03_neon, export=1
+ push {lr}
+ add ip, r1, r2
+ b put_h264_qpel8_mc01
+ .endfunc
+
+function ff_put_h264_qpel8_mc13_neon, export=1
+ push {r0, r1, r2, lr}
+ add r1, r1, r2
+ b put_h264_qpel8_mc11
+ .endfunc
+
+function ff_put_h264_qpel8_mc23_neon, export=1
+ push {r0, r1, r4, r10, r11, lr}
+ add r1, r1, r2
+ b put_h264_qpel8_mc21
+ .endfunc
+
+function ff_put_h264_qpel8_mc33_neon, export=1
+ add r1, r1, #1
+ push {r0, r1, r2, lr}
+ add r1, r1, r2
+ sub r1, r1, #1
+ b put_h264_qpel8_mc11
+ .endfunc
+
+function ff_put_h264_qpel16_mc10_neon, export=1
+ lowpass_const r3
+ mov r3, r1
+ sub r1, r1, #2
+ b put_h264_qpel16_h_lowpass_l2_neon
+ .endfunc
+
+function ff_put_h264_qpel16_mc20_neon, export=1
+ lowpass_const r3
+ sub r1, r1, #2
+ mov r3, r2
+ b put_h264_qpel16_h_lowpass_neon
+ .endfunc
+
+function ff_put_h264_qpel16_mc30_neon, export=1
+ lowpass_const r3
+ add r3, r1, #1
+ sub r1, r1, #2
+ b put_h264_qpel16_h_lowpass_l2_neon
+ .endfunc
+
+function ff_put_h264_qpel16_mc01_neon, export=1
+ push {r4, lr}
+ mov ip, r1
+put_h264_qpel16_mc01:
+ lowpass_const r3
+ mov r3, r2
+ sub r1, r1, r2, lsl #1
+ vpush {d8-d15}
+ bl put_h264_qpel16_v_lowpass_l2_neon
+ vpop {d8-d15}
+ pop {r4, pc}
+ .endfunc
+
+function ff_put_h264_qpel16_mc11_neon, export=1
+ push {r0, r1, r4, lr}
+put_h264_qpel16_mc11:
+ lowpass_const r3
+ sub sp, sp, #256
+ mov r0, sp
+ sub r1, r1, #2
+ mov r3, #16
+ vpush {d8-d15}
+ bl put_h264_qpel16_h_lowpass_neon
+ add r0, sp, #256
+ ldrd r0, [r0, #64]
+ mov r3, r2
+ add ip, sp, #64
+ sub r1, r1, r2, lsl #1
+ mov r2, #16
+ bl put_h264_qpel16_v_lowpass_l2_neon
+ vpop {d8-d15}
+ add sp, sp, #(256+8)
+ pop {r4, pc}
+ .endfunc
+
+function ff_put_h264_qpel16_mc21_neon, export=1
+ push {r0, r1, r4-r5, r9-r11, lr}
+put_h264_qpel16_mc21:
+ lowpass_const r3
+ mov r11, sp
+ bic sp, sp, #15
+ sub sp, sp, #(16*16+16*12)
+ sub r1, r1, #2
+ mov r0, sp
+ vpush {d8-d15}
+ bl put_h264_qpel16_h_lowpass_neon_packed
+ mov r4, r0
+ ldrd r0, [r11]
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, r2
+ bl put_h264_qpel16_hv_lowpass_l2_neon
+ vpop {d8-d15}
+ add sp, r11, #8
+ pop {r4-r5, r9-r11, pc}
+ .endfunc
+
+function ff_put_h264_qpel16_mc31_neon, export=1
+ add r1, r1, #1
+ push {r0, r1, r4, lr}
+ sub r1, r1, #1
+ b put_h264_qpel16_mc11
+ .endfunc
+
+function ff_put_h264_qpel16_mc02_neon, export=1
+ push {r4, lr}
+ lowpass_const r3
+ sub r1, r1, r2, lsl #1
+ mov r3, r2
+ vpush {d8-d15}
+ bl put_h264_qpel16_v_lowpass_neon
+ vpop {d8-d15}
+ pop {r4, pc}
+ .endfunc
+
+function ff_put_h264_qpel16_mc12_neon, export=1
+ push {r0, r1, r4-r5, r9-r11, lr}
+put_h264_qpel16_mc12:
+ lowpass_const r3
+ mov r11, sp
+ bic sp, sp, #15
+ sub sp, sp, #(16*16+16*12)
+ sub r1, r1, r2, lsl #1
+ mov r0, sp
+ mov r3, r2
+ vpush {d8-d15}
+ bl put_h264_qpel16_v_lowpass_neon_packed
+ mov r4, r0
+ ldrd r0, [r11]
+ sub r1, r1, r3, lsl #1
+ sub r1, r1, #2
+ mov r2, r3
+ bl put_h264_qpel16_hv_lowpass_l2_neon
+ vpop {d8-d15}
+ add sp, r11, #8
+ pop {r4-r5, r9-r11, pc}
+ .endfunc
+
+function ff_put_h264_qpel16_mc22_neon, export=1
+ push {r4, r9-r11, lr}
+ lowpass_const r3
+ mov r11, sp
+ bic sp, sp, #15
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, r2
+ sub sp, sp, #(16*12)
+ mov r4, sp
+ vpush {d8-d15}
+ bl put_h264_qpel16_hv_lowpass_neon
+ vpop {d8-d15}
+ mov sp, r11
+ pop {r4, r9-r11, pc}
+ .endfunc
+
+function ff_put_h264_qpel16_mc32_neon, export=1
+ push {r0, r1, r4-r5, r9-r11, lr}
+ add r1, r1, #1
+ b put_h264_qpel16_mc12
+ .endfunc
+
+function ff_put_h264_qpel16_mc03_neon, export=1
+ push {r4, lr}
+ add ip, r1, r2
+ b put_h264_qpel16_mc01
+ .endfunc
+
+function ff_put_h264_qpel16_mc13_neon, export=1
+ push {r0, r1, r4, lr}
+ add r1, r1, r2
+ b put_h264_qpel16_mc11
+ .endfunc
+
+function ff_put_h264_qpel16_mc23_neon, export=1
+ push {r0, r1, r4-r5, r9-r11, lr}
+ add r1, r1, r2
+ b put_h264_qpel16_mc21
+ .endfunc
+
+function ff_put_h264_qpel16_mc33_neon, export=1
+ add r1, r1, #1
+ push {r0, r1, r4, lr}
+ add r1, r1, r2
+ sub r1, r1, #1
+ b put_h264_qpel16_mc11
+ .endfunc
diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
new file mode 100644
index 0000000..6527390
--- /dev/null
+++ b/libavcodec/arm/h264idct_neon.S
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+ preserve8
+ .fpu neon
+
+ .text
+
+function ff_h264_idct_add_neon, export=1
+ mov r3, #(1<<5)
+ vmov.i16 d16, #0
+ vmov.16 d16[0], r3
+ vld1.64 {d0-d3}, [r1,:128]
+ vadd.i16 d0, d0, d16
+
+ vswp d1, d2
+ vadd.i16 d4, d0, d1
+ vshr.s16 q8, q1, #1
+ vsub.i16 d5, d0, d1
+ vadd.i16 d6, d2, d17
+ vsub.i16 d7, d16, d3
+ vadd.i16 q0, q2, q3
+ vsub.i16 q1, q2, q3
+
+ vtrn.16 d0, d1
+ vtrn.16 d3, d2
+ vtrn.32 d0, d3
+ vtrn.32 d1, d2
+
+ vadd.i16 d4, d0, d3
+ vld1.32 {d18[0]}, [r0,:32], r2
+ vswp d1, d3
+ vshr.s16 q8, q1, #1
+ vld1.32 {d19[1]}, [r0,:32], r2
+ vsub.i16 d5, d0, d1
+ vld1.32 {d18[1]}, [r0,:32], r2
+ vadd.i16 d6, d16, d3
+ vld1.32 {d19[0]}, [r0,:32], r2
+ vsub.i16 d7, d2, d17
+ sub r0, r0, r2, lsl #2
+ vadd.i16 q0, q2, q3
+ vsub.i16 q1, q2, q3
+
+ vshr.s16 q0, q0, #6
+ vshr.s16 q1, q1, #6
+
+ vaddw.u8 q0, q0, d18
+ vaddw.u8 q1, q1, d19
+
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+
+ vst1.32 {d0[0]}, [r0,:32], r2
+ vst1.32 {d1[1]}, [r0,:32], r2
+ vst1.32 {d0[1]}, [r0,:32], r2
+ vst1.32 {d1[0]}, [r0,:32], r2
+
+ bx lr
+ .endfunc
+
+function ff_h264_idct_dc_add_neon, export=1
+ vld1.16 {d2[],d3[]}, [r1,:16]
+ vrshr.s16 q1, q1, #6
+ vld1.32 {d0[0]}, [r0,:32], r2
+ vld1.32 {d0[1]}, [r0,:32], r2
+ vaddw.u8 q2, q1, d0
+ vld1.32 {d1[0]}, [r0,:32], r2
+ vld1.32 {d1[1]}, [r0,:32], r2
+ vaddw.u8 q1, q1, d1
+ vqmovun.s16 d0, q2
+ vqmovun.s16 d1, q1
+ sub r0, r0, r2, lsl #2
+ vst1.32 {d0[0]}, [r0,:32], r2
+ vst1.32 {d0[1]}, [r0,:32], r2
+ vst1.32 {d1[0]}, [r0,:32], r2
+ vst1.32 {d1[1]}, [r0,:32], r2
+ bx lr
+ .endfunc
+
+function ff_h264_idct_add16_neon, export=1
+ push {r4-r8,lr}
+ mov r4, r0
+ mov r5, r1
+ mov r1, r2
+ mov r2, r3
+ ldr r6, [sp, #24]
+ movrel r7, scan8
+ mov ip, #16
+1: ldrb r8, [r7], #1
+ ldr r0, [r5], #4
+ ldrb r8, [r6, r8]
+ subs r8, r8, #1
+ blt 2f
+ ldrsh lr, [r1]
+ add r0, r0, r4
+ movne lr, #0
+ cmp lr, #0
+ adrne lr, ff_h264_idct_dc_add_neon
+ adreq lr, ff_h264_idct_add_neon
+ blx lr
+2: subs ip, ip, #1
+ add r1, r1, #32
+ bne 1b
+ pop {r4-r8,pc}
+ .endfunc
+
+function ff_h264_idct_add16intra_neon, export=1
+ push {r4-r8,lr}
+ mov r4, r0
+ mov r5, r1
+ mov r1, r2
+ mov r2, r3
+ ldr r6, [sp, #24]
+ movrel r7, scan8
+ mov ip, #16
+1: ldrb r8, [r7], #1
+ ldr r0, [r5], #4
+ ldrb r8, [r6, r8]
+ add r0, r0, r4
+ cmp r8, #0
+ ldrsh r8, [r1]
+ adrne lr, ff_h264_idct_add_neon
+ adreq lr, ff_h264_idct_dc_add_neon
+ cmpeq r8, #0
+ blxne lr
+ subs ip, ip, #1
+ add r1, r1, #32
+ bne 1b
+ pop {r4-r8,pc}
+ .endfunc
+
+function ff_h264_idct_add8_neon, export=1
+ push {r4-r10,lr}
+ ldm r0, {r4,r9}
+ add r5, r1, #16*4
+ add r1, r2, #16*32
+ mov r2, r3
+ ldr r6, [sp, #32]
+ movrel r7, scan8+16
+ mov ip, #8
+1: ldrb r8, [r7], #1
+ ldr r0, [r5], #4
+ ldrb r8, [r6, r8]
+ tst ip, #4
+ addeq r0, r0, r4
+ addne r0, r0, r9
+ cmp r8, #0
+ ldrsh r8, [r1]
+ adrne lr, ff_h264_idct_add_neon
+ adreq lr, ff_h264_idct_dc_add_neon
+ cmpeq r8, #0
+ blxne lr
+ subs ip, ip, #1
+ add r1, r1, #32
+ bne 1b
+ pop {r4-r10,pc}
+ .endfunc
+
+ .section .rodata
+scan8: .byte 4+1*8, 5+1*8, 4+2*8, 5+2*8
+ .byte 6+1*8, 7+1*8, 6+2*8, 7+2*8
+ .byte 4+3*8, 5+3*8, 4+4*8, 5+4*8
+ .byte 6+3*8, 7+3*8, 6+4*8, 7+4*8
+ .byte 1+1*8, 2+1*8
+ .byte 1+2*8, 2+2*8
+ .byte 1+4*8, 2+4*8
+ .byte 1+5*8, 2+5*8
diff --git a/libavcodec/armv4l/jrevdct_arm.S b/libavcodec/arm/jrevdct_arm.S
similarity index 100%
rename from libavcodec/armv4l/jrevdct_arm.S
rename to libavcodec/arm/jrevdct_arm.S
diff --git a/libavcodec/arm/mathops.h b/libavcodec/arm/mathops.h
new file mode 100644
index 0000000..e36316c
--- /dev/null
+++ b/libavcodec/arm/mathops.h
@@ -0,0 +1,93 @@
+/*
+ * simple math operations
+ * Copyright (c) 2006 Michael Niedermayer <michaelni at gmx.at> et al
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_MATHOPS_H
+#define AVCODEC_ARM_MATHOPS_H
+
+#include <stdint.h>
+#include "libavutil/common.h"
+
+# define MULL MULL
+static inline av_const int MULL(int a, int b, unsigned shift)
+{
+ int lo, hi;
+ __asm__("smull %0, %1, %2, %3 \n\t"
+ "mov %0, %0, lsr %4 \n\t"
+ "add %1, %0, %1, lsl %5 \n\t"
+ : "=&r"(lo), "=&r"(hi)
+ : "r"(b), "r"(a), "i"(shift), "i"(32-shift));
+ return hi;
+}
+
+#define MULH MULH
+#ifdef HAVE_ARMV6
+static inline av_const int MULH(int a, int b)
+{
+ int r;
+ __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
+ return r;
+}
+#else
+static inline av_const int MULH(int a, int b)
+{
+ int lo, hi;
+ __asm__ ("smull %0, %1, %2, %3" : "=&r"(lo), "=&r"(hi) : "r"(b), "r"(a));
+ return hi;
+}
+#endif
+
+static inline av_const int64_t MUL64(int a, int b)
+{
+ union { uint64_t x; unsigned hl[2]; } x;
+ __asm__ ("smull %0, %1, %2, %3"
+ : "=r"(x.hl[0]), "=r"(x.hl[1]) : "r"(a), "r"(b));
+ return x.x;
+}
+#define MUL64 MUL64
+
+static inline av_const int64_t MAC64(int64_t d, int a, int b)
+{
+ union { uint64_t x; unsigned hl[2]; } x = { d };
+ __asm__ ("smlal %0, %1, %2, %3"
+ : "+r"(x.hl[0]), "+r"(x.hl[1]) : "r"(a), "r"(b));
+ return x.x;
+}
+#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
+#define MLS64(d, a, b) MAC64(d, -(a), b)
+
+#if defined(HAVE_ARMV5TE)
+
+/* signed 16x16 -> 32 multiply add accumulate */
+# define MAC16(rt, ra, rb) \
+ __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb));
+
+/* signed 16x16 -> 32 multiply */
+# define MUL16 MUL16
+static inline av_const MUL16(int ra, int rb)
+{
+ int rt;
+ __asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb));
+ return rt;
+}
+
+#endif
+
+#endif /* AVCODEC_ARM_MATHOPS_H */
diff --git a/libavcodec/arm/mpegvideo_arm.c b/libavcodec/arm/mpegvideo_arm.c
new file mode 100644
index 0000000..18faed2
--- /dev/null
+++ b/libavcodec/arm/mpegvideo_arm.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2002 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
+
+void MPV_common_init_iwmmxt(MpegEncContext *s);
+void MPV_common_init_armv5te(MpegEncContext *s);
+
+void MPV_common_init_arm(MpegEncContext *s)
+{
+ /* IWMMXT support is a superset of armv5te, so
+ * allow optimized functions for armv5te unless
+ * a better iwmmxt function exists
+ */
+#ifdef HAVE_ARMV5TE
+ MPV_common_init_armv5te(s);
+#endif
+#ifdef HAVE_IWMMXT
+ MPV_common_init_iwmmxt(s);
+#endif
+}
diff --git a/libavcodec/arm/mpegvideo_armv5te.c b/libavcodec/arm/mpegvideo_armv5te.c
new file mode 100644
index 0000000..b213cf1
--- /dev/null
+++ b/libavcodec/arm/mpegvideo_armv5te.c
@@ -0,0 +1,100 @@
+/*
+ * Optimization of some functions from mpegvideo.c for armv5te
+ * Copyright (c) 2007 Siarhei Siamashka <ssvb at users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
+
+void ff_dct_unquantize_h263_armv5te(DCTELEM *block, int qmul, int qadd, int count);
+
+#ifdef ENABLE_ARM_TESTS
+/**
+ * h263 dequantizer supplementary function, it is performance critical and needs to
+ * have optimized implementations for each architecture. Is also used as a reference
+ * implementation in regression tests
+ */
+static inline void dct_unquantize_h263_helper_c(DCTELEM *block, int qmul, int qadd, int count)
+{
+ int i, level;
+ for (i = 0; i < count; i++) {
+ level = block[i];
+ if (level) {
+ if (level < 0) {
+ level = level * qmul - qadd;
+ } else {
+ level = level * qmul + qadd;
+ }
+ block[i] = level;
+ }
+ }
+}
+#endif
+
+static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
+ DCTELEM *block, int n, int qscale)
+{
+ int level, qmul, qadd;
+ int nCoeffs;
+
+ assert(s->block_last_index[n]>=0);
+
+ qmul = qscale << 1;
+
+ if (!s->h263_aic) {
+ if (n < 4)
+ level = block[0] * s->y_dc_scale;
+ else
+ level = block[0] * s->c_dc_scale;
+ qadd = (qscale - 1) | 1;
+ }else{
+ qadd = 0;
+ level = block[0];
+ }
+ if(s->ac_pred)
+ nCoeffs=63;
+ else
+ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+
+ ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
+ block[0] = level;
+}
+
+static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
+ DCTELEM *block, int n, int qscale)
+{
+ int qmul, qadd;
+ int nCoeffs;
+
+ assert(s->block_last_index[n]>=0);
+
+ qadd = (qscale - 1) | 1;
+ qmul = qscale << 1;
+
+ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+
+ ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
+}
+
+void MPV_common_init_armv5te(MpegEncContext *s)
+{
+ s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te;
+ s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te;
+}
diff --git a/libavcodec/arm/mpegvideo_armv5te_s.S b/libavcodec/arm/mpegvideo_armv5te_s.S
new file mode 100644
index 0000000..aaa252d
--- /dev/null
+++ b/libavcodec/arm/mpegvideo_armv5te_s.S
@@ -0,0 +1,117 @@
+/*
+ * Optimization of some functions from mpegvideo.c for armv5te
+ * Copyright (c) 2007 Siarhei Siamashka <ssvb at users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+/*
+ * Special optimized version of dct_unquantize_h263_helper_c, it
+ * requires the block to be at least 8 bytes aligned, and may process
+ * more elements than requested. But it is guaranteed to never
+ * process more than 64 elements provided that count argument is <= 64,
+ * so it is safe. This function is optimized for a common distribution
+ * of values for nCoeffs (they are mostly multiple of 8 plus one or
+ * two extra elements). So this function processes data as 8 elements
+ * per loop iteration and contains optional 2 elements processing in
+ * the end.
+ *
+ * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
+ */
+function ff_dct_unquantize_h263_armv5te, export=1
+ push {r4-r9,lr}
+ mov ip, #0
+ subs r3, r3, #2
+ ble 2f
+ ldrd r4, [r0, #0]
+1:
+ ldrd r6, [r0, #8]
+
+ rsbs r9, ip, r4, asr #16
+ addgt r9, r2, #0
+ rsblt r9, r2, #0
+ smlatbne r9, r4, r1, r9
+
+ rsbs lr, ip, r5, asr #16
+ addgt lr, r2, #0
+ rsblt lr, r2, #0
+ smlatbne lr, r5, r1, lr
+
+ rsbs r8, ip, r4, asl #16
+ addgt r8, r2, #0
+ rsblt r8, r2, #0
+ smlabbne r4, r4, r1, r8
+
+ rsbs r8, ip, r5, asl #16
+ addgt r8, r2, #0
+ rsblt r8, r2, #0
+ smlabbne r5, r5, r1, r8
+
+ strh r4, [r0], #2
+ strh r9, [r0], #2
+ strh r5, [r0], #2
+ strh lr, [r0], #2
+
+ rsbs r9, ip, r6, asr #16
+ addgt r9, r2, #0
+ rsblt r9, r2, #0
+ smlatbne r9, r6, r1, r9
+
+ rsbs lr, ip, r7, asr #16
+ addgt lr, r2, #0
+ rsblt lr, r2, #0
+ smlatbne lr, r7, r1, lr
+
+ rsbs r8, ip, r6, asl #16
+ addgt r8, r2, #0
+ rsblt r8, r2, #0
+ smlabbne r6, r6, r1, r8
+
+ rsbs r8, ip, r7, asl #16
+ addgt r8, r2, #0
+ rsblt r8, r2, #0
+ smlabbne r7, r7, r1, r8
+
+ strh r6, [r0], #2
+ strh r9, [r0], #2
+ strh r7, [r0], #2
+ strh lr, [r0], #2
+
+ subs r3, r3, #8
+ ldrgtd r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
+ bgt 1b
+
+ adds r3, r3, #2
+ pople {r4-r9,pc}
+2:
+ ldrsh r9, [r0, #0]
+ ldrsh lr, [r0, #2]
+ mov r8, r2
+ cmp r9, #0
+ rsblt r8, r2, #0
+ smlabbne r9, r9, r1, r8
+ mov r8, r2
+ cmp lr, #0
+ rsblt r8, r2, #0
+ smlabbne lr, lr, r1, r8
+ strh r9, [r0], #2
+ strh lr, [r0], #2
+ pop {r4-r9,pc}
+ .endfunc
diff --git a/libavcodec/armv4l/mpegvideo_iwmmxt.c b/libavcodec/arm/mpegvideo_iwmmxt.c
similarity index 100%
rename from libavcodec/armv4l/mpegvideo_iwmmxt.c
rename to libavcodec/arm/mpegvideo_iwmmxt.c
diff --git a/libavcodec/armv4l/simple_idct_arm.S b/libavcodec/arm/simple_idct_arm.S
similarity index 100%
rename from libavcodec/armv4l/simple_idct_arm.S
rename to libavcodec/arm/simple_idct_arm.S
diff --git a/libavcodec/armv4l/simple_idct_armv5te.S b/libavcodec/arm/simple_idct_armv5te.S
similarity index 100%
rename from libavcodec/armv4l/simple_idct_armv5te.S
rename to libavcodec/arm/simple_idct_armv5te.S
diff --git a/libavcodec/armv4l/simple_idct_armv6.S b/libavcodec/arm/simple_idct_armv6.S
similarity index 100%
rename from libavcodec/armv4l/simple_idct_armv6.S
rename to libavcodec/arm/simple_idct_armv6.S
diff --git a/libavcodec/arm/simple_idct_neon.S b/libavcodec/arm/simple_idct_neon.S
new file mode 100644
index 0000000..e7099a2
--- /dev/null
+++ b/libavcodec/arm/simple_idct_neon.S
@@ -0,0 +1,362 @@
+/*
+ * ARM NEON IDCT
+ *
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * Based on Simple IDCT
+ * Copyright (c) 2001 Michael Niedermayer <michaelni at gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4c ((1<<(COL_SHIFT-1))/W4)
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+#define w1 d0[0]
+#define w2 d0[1]
+#define w3 d0[2]
+#define w4 d0[3]
+#define w5 d1[0]
+#define w6 d1[1]
+#define w7 d1[2]
+#define w4c d1[3]
+
+ .fpu neon
+
+ .macro idct_col4_top
+ vmull.s16 q7, d6, w2 /* q9 = W2 * col[2] */
+ vmull.s16 q8, d6, w6 /* q10 = W6 * col[2] */
+ vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */
+ vadd.i32 q11, q15, q7
+ vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */
+ vadd.i32 q12, q15, q8
+ vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */
+ vsub.i32 q13, q15, q8
+ vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */
+ vsub.i32 q14, q15, q7
+
+ vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */
+ vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */
+ vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */
+ vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */
+ .endm
+
+ .text
+ .align 6
+
+function idct_row4_neon
+ vmov.i32 q15, #(1<<(ROW_SHIFT-1))
+ vld1.64 {d2-d5}, [r2,:128]!
+ vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */
+ vld1.64 {d6,d7}, [r2,:128]!
+ vorr d10, d3, d5
+ vld1.64 {d8,d9}, [r2,:128]!
+ add r2, r2, #-64
+
+ vorr d11, d7, d9
+ vorr d10, d10, d11
+ vmov r3, r4, d10
+
+ idct_col4_top
+
+ orrs r3, r3, r4
+ beq 1f
+
+ vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
+ vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
+ vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
+ vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
+ vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
+ vadd.i32 q11, q11, q7
+ vsub.i32 q12, q12, q7
+ vsub.i32 q13, q13, q7
+ vadd.i32 q14, q14, q7
+ vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
+ vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
+ vmlal.s16 q9, d9, w7
+ vmlsl.s16 q10, d9, w5
+ vmlal.s16 q5, d9, w3
+ vmlsl.s16 q6, d9, w1
+ vadd.i32 q11, q11, q7
+ vsub.i32 q12, q12, q8
+ vadd.i32 q13, q13, q8
+ vsub.i32 q14, q14, q7
+
+1: vadd.i32 q3, q11, q9
+ vadd.i32 q4, q12, q10
+ vshrn.i32 d2, q3, #ROW_SHIFT
+ vshrn.i32 d4, q4, #ROW_SHIFT
+ vadd.i32 q7, q13, q5
+ vadd.i32 q8, q14, q6
+ vtrn.16 d2, d4
+ vshrn.i32 d6, q7, #ROW_SHIFT
+ vshrn.i32 d8, q8, #ROW_SHIFT
+ vsub.i32 q14, q14, q6
+ vsub.i32 q11, q11, q9
+ vtrn.16 d6, d8
+ vsub.i32 q13, q13, q5
+ vshrn.i32 d3, q14, #ROW_SHIFT
+ vtrn.32 d2, d6
+ vsub.i32 q12, q12, q10
+ vtrn.32 d4, d8
+ vshrn.i32 d5, q13, #ROW_SHIFT
+ vshrn.i32 d7, q12, #ROW_SHIFT
+ vshrn.i32 d9, q11, #ROW_SHIFT
+
+ vtrn.16 d3, d5
+ vtrn.16 d7, d9
+ vtrn.32 d3, d7
+ vtrn.32 d5, d9
+
+ vst1.64 {d2-d5}, [r2,:128]!
+ vst1.64 {d6-d9}, [r2,:128]!
+
+ bx lr
+ .endfunc
+
+function idct_col4_neon
+ mov ip, #16
+ vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */
+ vdup.16 d30, w4c
+ vld1.64 {d4}, [r2,:64], ip /* d3 = col[1] */
+ vadd.i16 d30, d30, d2
+ vld1.64 {d6}, [r2,:64], ip /* d4 = col[2] */
+ vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/
+ vld1.64 {d8}, [r2,:64], ip /* d5 = col[3] */
+
+ ldrd r4, [r2]
+ ldrd r6, [r2, #16]
+ orrs r4, r4, r5
+
+ idct_col4_top
+ addeq r2, r2, #16
+ beq 1f
+
+ vld1.64 {d3}, [r2,:64], ip /* d6 = col[4] */
+ vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
+ vadd.i32 q11, q11, q7
+ vsub.i32 q12, q12, q7
+ vsub.i32 q13, q13, q7
+ vadd.i32 q14, q14, q7
+
+1: orrs r6, r6, r7
+ ldrd r4, [r2, #16]
+ addeq r2, r2, #16
+ beq 2f
+
+ vld1.64 {d5}, [r2,:64], ip /* d7 = col[5] */
+ vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
+ vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
+ vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
+ vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
+
+2: orrs r4, r4, r5
+ ldrd r4, [r2, #16]
+ addeq r2, r2, #16
+ beq 3f
+
+ vld1.64 {d7}, [r2,:64], ip /* d8 = col[6] */
+ vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
+ vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
+ vadd.i32 q11, q11, q7
+ vsub.i32 q14, q14, q7
+ vsub.i32 q12, q12, q8
+ vadd.i32 q13, q13, q8
+
+3: orrs r4, r4, r5
+ addeq r2, r2, #16
+ beq 4f
+
+ vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */
+ vmlal.s16 q9, d9, w7
+ vmlsl.s16 q10, d9, w5
+ vmlal.s16 q5, d9, w3
+ vmlsl.s16 q6, d9, w1
+
+4: vaddhn.i32 d2, q11, q9
+ vaddhn.i32 d3, q12, q10
+ vaddhn.i32 d4, q13, q5
+ vaddhn.i32 d5, q14, q6
+ vsubhn.i32 d9, q11, q9
+ vsubhn.i32 d8, q12, q10
+ vsubhn.i32 d7, q13, q5
+ vsubhn.i32 d6, q14, q6
+
+ bx lr
+ .endfunc
+
+ .align 6
+
+function idct_col4_st8_neon
+ vqshrun.s16 d2, q1, #COL_SHIFT-16
+ vqshrun.s16 d3, q2, #COL_SHIFT-16
+ vqshrun.s16 d4, q3, #COL_SHIFT-16
+ vqshrun.s16 d5, q4, #COL_SHIFT-16
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d2[1]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ vst1.32 {d3[1]}, [r0,:32], r1
+ vst1.32 {d4[0]}, [r0,:32], r1
+ vst1.32 {d4[1]}, [r0,:32], r1
+ vst1.32 {d5[0]}, [r0,:32], r1
+ vst1.32 {d5[1]}, [r0,:32], r1
+
+ bx lr
+ .endfunc
+
+ .section .rodata
+ .align 4
+idct_coeff_neon:
+ .short W1, W2, W3, W4, W5, W6, W7, W4c
+ .previous
+
+ .macro idct_start data
+ push {r4-r7, lr}
+ pld [\data]
+ pld [\data, #64]
+ vpush {d8-d15}
+ movrel r3, idct_coeff_neon
+ vld1.64 {d0,d1}, [r3,:128]
+ .endm
+
+ .macro idct_end
+ vpop {d8-d15}
+ pop {r4-r7, pc}
+ .endm
+
+/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */
+function ff_simple_idct_put_neon, export=1
+ idct_start r2
+
+ bl idct_row4_neon
+ bl idct_row4_neon
+ add r2, r2, #-128
+ bl idct_col4_neon
+ bl idct_col4_st8_neon
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #4
+ add r2, r2, #-120
+ bl idct_col4_neon
+ bl idct_col4_st8_neon
+
+ idct_end
+ .endfunc
+
+ .align 6
+
+function idct_col4_add8_neon
+ mov ip, r0
+
+ vld1.32 {d10[0]}, [r0,:32], r1
+ vshr.s16 q1, q1, #COL_SHIFT-16
+ vld1.32 {d10[1]}, [r0,:32], r1
+ vshr.s16 q2, q2, #COL_SHIFT-16
+ vld1.32 {d11[0]}, [r0,:32], r1
+ vshr.s16 q3, q3, #COL_SHIFT-16
+ vld1.32 {d11[1]}, [r0,:32], r1
+ vshr.s16 q4, q4, #COL_SHIFT-16
+ vld1.32 {d12[0]}, [r0,:32], r1
+ vaddw.u8 q1, q1, d10
+ vld1.32 {d12[1]}, [r0,:32], r1
+ vaddw.u8 q2, q2, d11
+ vld1.32 {d13[0]}, [r0,:32], r1
+ vqmovun.s16 d2, q1
+ vld1.32 {d13[1]}, [r0,:32], r1
+ vaddw.u8 q3, q3, d12
+ vst1.32 {d2[0]}, [ip,:32], r1
+ vqmovun.s16 d3, q2
+ vst1.32 {d2[1]}, [ip,:32], r1
+ vaddw.u8 q4, q4, d13
+ vst1.32 {d3[0]}, [ip,:32], r1
+ vqmovun.s16 d4, q3
+ vst1.32 {d3[1]}, [ip,:32], r1
+ vqmovun.s16 d5, q4
+ vst1.32 {d4[0]}, [ip,:32], r1
+ vst1.32 {d4[1]}, [ip,:32], r1
+ vst1.32 {d5[0]}, [ip,:32], r1
+ vst1.32 {d5[1]}, [ip,:32], r1
+
+ bx lr
+ .endfunc
+
+/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */
+function ff_simple_idct_add_neon, export=1
+ idct_start r2
+
+ bl idct_row4_neon
+ bl idct_row4_neon
+ add r2, r2, #-128
+ bl idct_col4_neon
+ bl idct_col4_add8_neon
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #4
+ add r2, r2, #-120
+ bl idct_col4_neon
+ bl idct_col4_add8_neon
+
+ idct_end
+ .endfunc
+
+ .align 6
+
+function idct_col4_st16_neon
+ mov ip, #16
+
+ vshr.s16 q1, q1, #COL_SHIFT-16
+ vshr.s16 q2, q2, #COL_SHIFT-16
+ vst1.64 {d2}, [r2,:64], ip
+ vshr.s16 q3, q3, #COL_SHIFT-16
+ vst1.64 {d3}, [r2,:64], ip
+ vshr.s16 q4, q4, #COL_SHIFT-16
+ vst1.64 {d4}, [r2,:64], ip
+ vst1.64 {d5}, [r2,:64], ip
+ vst1.64 {d6}, [r2,:64], ip
+ vst1.64 {d7}, [r2,:64], ip
+ vst1.64 {d8}, [r2,:64], ip
+ vst1.64 {d9}, [r2,:64], ip
+
+ bx lr
+ .endfunc
+
+/* void ff_simple_idct_neon(DCTELEM *data); */
+function ff_simple_idct_neon, export=1
+ idct_start r0
+
+ mov r2, r0
+ bl idct_row4_neon
+ bl idct_row4_neon
+ add r2, r2, #-128
+ bl idct_col4_neon
+ add r2, r2, #-128
+ bl idct_col4_st16_neon
+ add r2, r2, #-120
+ bl idct_col4_neon
+ add r2, r2, #-128
+ bl idct_col4_st16_neon
+
+ idct_end
+ .endfunc
diff --git a/libavcodec/armv4l/asm.S b/libavcodec/armv4l/asm.S
deleted file mode 100644
index e2595f4..0000000
--- a/libavcodec/armv4l/asm.S
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
- .macro require8, val=1
- .eabi_attribute 24, \val
- .endm
-
- .macro preserve8, val=1
- .eabi_attribute 25, \val
- .endm
-
- .macro function name, export=0
-.if \export
- .global \name
-.endif
- .type \name, %function
- .func \name
-\name:
- .endm
diff --git a/libavcodec/armv4l/dsputil_arm.c b/libavcodec/armv4l/dsputil_arm.c
deleted file mode 100644
index 695bf1c..0000000
--- a/libavcodec/armv4l/dsputil_arm.c
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * ARMv4L optimized DSP utils
- * Copyright (c) 2001 Lionel Ulmer.
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/dsputil.h"
-#ifdef HAVE_IPP
-#include <ipp.h>
-#endif
-
-extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
-extern void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx);
-
-extern void j_rev_dct_ARM(DCTELEM *data);
-extern void simple_idct_ARM(DCTELEM *data);
-
-extern void simple_idct_armv5te(DCTELEM *data);
-extern void simple_idct_put_armv5te(uint8_t *dest, int line_size,
- DCTELEM *data);
-extern void simple_idct_add_armv5te(uint8_t *dest, int line_size,
- DCTELEM *data);
-
-extern void ff_simple_idct_armv6(DCTELEM *data);
-extern void ff_simple_idct_put_armv6(uint8_t *dest, int line_size,
- DCTELEM *data);
-extern void ff_simple_idct_add_armv6(uint8_t *dest, int line_size,
- DCTELEM *data);
-
-/* XXX: local hack */
-static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
-static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
-
-void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-
-void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-
-void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-
-extern void ff_prefetch_arm(void *mem, int stride, int h);
-
-CALL_2X_PIXELS(put_pixels16_x2_arm , put_pixels8_x2_arm , 8)
-CALL_2X_PIXELS(put_pixels16_y2_arm , put_pixels8_y2_arm , 8)
-CALL_2X_PIXELS(put_pixels16_xy2_arm, put_pixels8_xy2_arm, 8)
-CALL_2X_PIXELS(put_no_rnd_pixels16_x2_arm , put_no_rnd_pixels8_x2_arm , 8)
-CALL_2X_PIXELS(put_no_rnd_pixels16_y2_arm , put_no_rnd_pixels8_y2_arm , 8)
-CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_arm, put_no_rnd_pixels8_xy2_arm, 8)
-
-extern void ff_add_pixels_clamped_ARM(short *block, unsigned char *dest,
- int line_size);
-
-/* XXX: those functions should be suppressed ASAP when all IDCTs are
- converted */
-static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
-{
- j_rev_dct_ARM (block);
- ff_put_pixels_clamped(block, dest, line_size);
-}
-static void j_rev_dct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
-{
- j_rev_dct_ARM (block);
- ff_add_pixels_clamped(block, dest, line_size);
-}
-static void simple_idct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
-{
- simple_idct_ARM (block);
- ff_put_pixels_clamped(block, dest, line_size);
-}
-static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
-{
- simple_idct_ARM (block);
- ff_add_pixels_clamped(block, dest, line_size);
-}
-
-#ifdef HAVE_IPP
-static void simple_idct_ipp(DCTELEM *block)
-{
- ippiDCT8x8Inv_Video_16s_C1I(block);
-}
-static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block)
-{
- ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size);
-}
-
-void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size);
-
-static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block)
-{
- ippiDCT8x8Inv_Video_16s_C1I(block);
-#ifdef HAVE_IWMMXT
- add_pixels_clamped_iwmmxt(block, dest, line_size);
-#else
- ff_add_pixels_clamped_ARM(block, dest, line_size);
-#endif
-}
-#endif
-
-int mm_support(void)
-{
- return ENABLE_IWMMXT * FF_MM_IWMMXT;
-}
-
-void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
-{
- int idct_algo= avctx->idct_algo;
-
- ff_put_pixels_clamped = c->put_pixels_clamped;
- ff_add_pixels_clamped = c->add_pixels_clamped;
-
- if (avctx->lowres == 0) {
- if(idct_algo == FF_IDCT_AUTO){
-#if defined(HAVE_IPP)
- idct_algo = FF_IDCT_IPP;
-#elif defined(HAVE_ARMV6)
- idct_algo = FF_IDCT_SIMPLEARMV6;
-#elif defined(HAVE_ARMV5TE)
- idct_algo = FF_IDCT_SIMPLEARMV5TE;
-#else
- idct_algo = FF_IDCT_ARM;
-#endif
- }
-
- if(idct_algo==FF_IDCT_ARM){
- c->idct_put= j_rev_dct_ARM_put;
- c->idct_add= j_rev_dct_ARM_add;
- c->idct = j_rev_dct_ARM;
- c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;/* FF_NO_IDCT_PERM */
- } else if (idct_algo==FF_IDCT_SIMPLEARM){
- c->idct_put= simple_idct_ARM_put;
- c->idct_add= simple_idct_ARM_add;
- c->idct = simple_idct_ARM;
- c->idct_permutation_type= FF_NO_IDCT_PERM;
-#ifdef HAVE_ARMV6
- } else if (idct_algo==FF_IDCT_SIMPLEARMV6){
- c->idct_put= ff_simple_idct_put_armv6;
- c->idct_add= ff_simple_idct_add_armv6;
- c->idct = ff_simple_idct_armv6;
- c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
-#endif
-#ifdef HAVE_ARMV5TE
- } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){
- c->idct_put= simple_idct_put_armv5te;
- c->idct_add= simple_idct_add_armv5te;
- c->idct = simple_idct_armv5te;
- c->idct_permutation_type = FF_NO_IDCT_PERM;
-#endif
-#ifdef HAVE_IPP
- } else if (idct_algo==FF_IDCT_IPP){
- c->idct_put= simple_idct_ipp_put;
- c->idct_add= simple_idct_ipp_add;
- c->idct = simple_idct_ipp;
- c->idct_permutation_type= FF_NO_IDCT_PERM;
-#endif
- }
- }
-
- c->put_pixels_tab[0][0] = put_pixels16_arm;
- c->put_pixels_tab[0][1] = put_pixels16_x2_arm; //OK!
- c->put_pixels_tab[0][2] = put_pixels16_y2_arm; //OK!
- c->put_pixels_tab[0][3] = put_pixels16_xy2_arm;
- c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm;
- c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; // OK
- c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; //OK
- c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm;
- c->put_pixels_tab[1][0] = put_pixels8_arm; //OK
- c->put_pixels_tab[1][1] = put_pixels8_x2_arm; //OK
- c->put_pixels_tab[1][2] = put_pixels8_y2_arm;
- c->put_pixels_tab[1][3] = put_pixels8_xy2_arm;
- c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;//OK
- c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; //OK
- c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK
- c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;
-
-#ifdef HAVE_ARMV5TE
- c->prefetch = ff_prefetch_arm;
-#endif
-
-#ifdef HAVE_IWMMXT
- dsputil_init_iwmmxt(c, avctx);
-#endif
-#ifdef HAVE_ARMVFP
- ff_float_init_arm_vfp(c, avctx);
-#endif
-}
diff --git a/libavcodec/armv4l/dsputil_arm_s.S b/libavcodec/armv4l/dsputil_arm_s.S
deleted file mode 100644
index ba06f37..0000000
--- a/libavcodec/armv4l/dsputil_arm_s.S
+++ /dev/null
@@ -1,799 +0,0 @@
-@
-@ ARMv4L optimized DSP utils
-@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
-@
-@ This file is part of FFmpeg.
-@
-@ FFmpeg is free software; you can redistribute it and/or
-@ modify it under the terms of the GNU Lesser General Public
-@ License as published by the Free Software Foundation; either
-@ version 2.1 of the License, or (at your option) any later version.
-@
-@ FFmpeg is distributed in the hope that it will be useful,
-@ but WITHOUT ANY WARRANTY; without even the implied warranty of
-@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-@ Lesser General Public License for more details.
-@
-@ You should have received a copy of the GNU Lesser General Public
-@ License along with FFmpeg; if not, write to the Free Software
-@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-@
-
-#include "config.h"
-#include "asm.S"
-
- preserve8
-
-#ifndef HAVE_PLD
-.macro pld reg
-.endm
-#endif
-
-#ifdef HAVE_ARMV5TE
-function ff_prefetch_arm, export=1
- subs r2, r2, #1
- pld [r0]
- add r0, r0, r1
- bne ff_prefetch_arm
- bx lr
- .endfunc
-#endif
-
-.macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
- mov \Rd0, \Rn0, lsr #(\shift * 8)
- mov \Rd1, \Rn1, lsr #(\shift * 8)
- mov \Rd2, \Rn2, lsr #(\shift * 8)
- mov \Rd3, \Rn3, lsr #(\shift * 8)
- orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
- orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
- orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
- orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
-.endm
-.macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
- mov \R0, \R0, lsr #(\shift * 8)
- orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
- mov \R1, \R1, lsr #(\shift * 8)
- orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
-.endm
-.macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
- mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
- mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
- orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
- orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
-.endm
-
-.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
- @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
- @ Rmask = 0xFEFEFEFE
- @ Rn = destroy
- eor \Rd0, \Rn0, \Rm0
- eor \Rd1, \Rn1, \Rm1
- orr \Rn0, \Rn0, \Rm0
- orr \Rn1, \Rn1, \Rm1
- and \Rd0, \Rd0, \Rmask
- and \Rd1, \Rd1, \Rmask
- sub \Rd0, \Rn0, \Rd0, lsr #1
- sub \Rd1, \Rn1, \Rd1, lsr #1
-.endm
-
-.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
- @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
- @ Rmask = 0xFEFEFEFE
- @ Rn = destroy
- eor \Rd0, \Rn0, \Rm0
- eor \Rd1, \Rn1, \Rm1
- and \Rn0, \Rn0, \Rm0
- and \Rn1, \Rn1, \Rm1
- and \Rd0, \Rd0, \Rmask
- and \Rd1, \Rd1, \Rmask
- add \Rd0, \Rn0, \Rd0, lsr #1
- add \Rd1, \Rn1, \Rd1, lsr #1
-.endm
-
-@ ----------------------------------------------------------------
- .align 8
-function put_pixels16_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- stmfd sp!, {r4-r11, lr} @ R14 is also called LR
- adr r5, 5f
- ands r4, r1, #3
- bic r1, r1, #3
- add r5, r5, r4, lsl #2
- ldrne pc, [r5]
-1:
- ldmia r1, {r4-r7}
- add r1, r1, r2
- stmia r0, {r4-r7}
- pld [r1]
- subs r3, r3, #1
- add r0, r0, r2
- bne 1b
- ldmfd sp!, {r4-r11, pc}
- .align 8
-2:
- ldmia r1, {r4-r8}
- add r1, r1, r2
- ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
- pld [r1]
- subs r3, r3, #1
- stmia r0, {r9-r12}
- add r0, r0, r2
- bne 2b
- ldmfd sp!, {r4-r11, pc}
- .align 8
-3:
- ldmia r1, {r4-r8}
- add r1, r1, r2
- ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
- pld [r1]
- subs r3, r3, #1
- stmia r0, {r9-r12}
- add r0, r0, r2
- bne 3b
- ldmfd sp!, {r4-r11, pc}
- .align 8
-4:
- ldmia r1, {r4-r8}
- add r1, r1, r2
- ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
- pld [r1]
- subs r3, r3, #1
- stmia r0, {r9-r12}
- add r0, r0, r2
- bne 4b
- ldmfd sp!, {r4-r11,pc}
- .align 8
-5:
- .word 1b
- .word 2b
- .word 3b
- .word 4b
- .endfunc
-
-@ ----------------------------------------------------------------
- .align 8
-function put_pixels8_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- stmfd sp!, {r4-r5,lr} @ R14 is also called LR
- adr r5, 5f
- ands r4, r1, #3
- bic r1, r1, #3
- add r5, r5, r4, lsl #2
- ldrne pc, [r5]
-1:
- ldmia r1, {r4-r5}
- add r1, r1, r2
- subs r3, r3, #1
- pld [r1]
- stmia r0, {r4-r5}
- add r0, r0, r2
- bne 1b
- ldmfd sp!, {r4-r5,pc}
- .align 8
-2:
- ldmia r1, {r4-r5, r12}
- add r1, r1, r2
- ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12
- pld [r1]
- subs r3, r3, #1
- stmia r0, {r4-r5}
- add r0, r0, r2
- bne 2b
- ldmfd sp!, {r4-r5,pc}
- .align 8
-3:
- ldmia r1, {r4-r5, r12}
- add r1, r1, r2
- ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12
- pld [r1]
- subs r3, r3, #1
- stmia r0, {r4-r5}
- add r0, r0, r2
- bne 3b
- ldmfd sp!, {r4-r5,pc}
- .align 8
-4:
- ldmia r1, {r4-r5, r12}
- add r1, r1, r2
- ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12
- pld [r1]
- subs r3, r3, #1
- stmia r0, {r4-r5}
- add r0, r0, r2
- bne 4b
- ldmfd sp!, {r4-r5,pc}
- .align 8
-5:
- .word 1b
- .word 2b
- .word 3b
- .word 4b
- .endfunc
-
-@ ----------------------------------------------------------------
- .align 8
-function put_pixels8_x2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- stmfd sp!, {r4-r10,lr} @ R14 is also called LR
- adr r5, 5f
- ands r4, r1, #3
- ldr r12, [r5]
- add r5, r5, r4, lsl #2
- bic r1, r1, #3
- ldrne pc, [r5]
-1:
- ldmia r1, {r4-r5, r10}
- add r1, r1, r2
- ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
- pld [r1]
- RND_AVG32 r8, r9, r4, r5, r6, r7, r12
- subs r3, r3, #1
- stmia r0, {r8-r9}
- add r0, r0, r2
- bne 1b
- ldmfd sp!, {r4-r10,pc}
- .align 8
-2:
- ldmia r1, {r4-r5, r10}
- add r1, r1, r2
- ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
- ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
- pld [r1]
- RND_AVG32 r4, r5, r6, r7, r8, r9, r12
- subs r3, r3, #1
- stmia r0, {r4-r5}
- add r0, r0, r2
- bne 2b
- ldmfd sp!, {r4-r10,pc}
- .align 8
-3:
- ldmia r1, {r4-r5, r10}
- add r1, r1, r2
- ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
- ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
- pld [r1]
- RND_AVG32 r4, r5, r6, r7, r8, r9, r12
- subs r3, r3, #1
- stmia r0, {r4-r5}
- add r0, r0, r2
- bne 3b
- ldmfd sp!, {r4-r10,pc}
- .align 8
-4:
- ldmia r1, {r4-r5, r10}
- add r1, r1, r2
- ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
- pld [r1]
- RND_AVG32 r8, r9, r6, r7, r5, r10, r12
- subs r3, r3, #1
- stmia r0, {r8-r9}
- add r0, r0, r2
- bne 4b
- ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
- .align 8
-5:
- .word 0xFEFEFEFE
- .word 2b
- .word 3b
- .word 4b
- .endfunc
-
- .align 8
-function put_no_rnd_pixels8_x2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- stmfd sp!, {r4-r10,lr} @ R14 is also called LR
- adr r5, 5f
- ands r4, r1, #3
- ldr r12, [r5]
- add r5, r5, r4, lsl #2
- bic r1, r1, #3
- ldrne pc, [r5]
-1:
- ldmia r1, {r4-r5, r10}
- add r1, r1, r2
- ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
- pld [r1]
- NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
- subs r3, r3, #1
- stmia r0, {r8-r9}
- add r0, r0, r2
- bne 1b
- ldmfd sp!, {r4-r10,pc}
- .align 8
-2:
- ldmia r1, {r4-r5, r10}
- add r1, r1, r2
- ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
- ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
- pld [r1]
- NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
- subs r3, r3, #1
- stmia r0, {r4-r5}
- add r0, r0, r2
- bne 2b
- ldmfd sp!, {r4-r10,pc}
- .align 8
-3:
- ldmia r1, {r4-r5, r10}
- add r1, r1, r2
- ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
- ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
- pld [r1]
- NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
- subs r3, r3, #1
- stmia r0, {r4-r5}
- add r0, r0, r2
- bne 3b
- ldmfd sp!, {r4-r10,pc}
- .align 8
-4:
- ldmia r1, {r4-r5, r10}
- add r1, r1, r2
- ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
- pld [r1]
- NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
- subs r3, r3, #1
- stmia r0, {r8-r9}
- add r0, r0, r2
- bne 4b
- ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
- .align 8
-5:
- .word 0xFEFEFEFE
- .word 2b
- .word 3b
- .word 4b
- .endfunc
-
-
-@ ----------------------------------------------------------------
- .align 8
-function put_pixels8_y2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- stmfd sp!, {r4-r11,lr} @ R14 is also called LR
- adr r5, 5f
- ands r4, r1, #3
- mov r3, r3, lsr #1
- ldr r12, [r5]
- add r5, r5, r4, lsl #2
- bic r1, r1, #3
- ldrne pc, [r5]
-1:
- ldmia r1, {r4-r5}
- add r1, r1, r2
-6: ldmia r1, {r6-r7}
- add r1, r1, r2
- pld [r1]
- RND_AVG32 r8, r9, r4, r5, r6, r7, r12
- ldmia r1, {r4-r5}
- add r1, r1, r2
- stmia r0, {r8-r9}
- add r0, r0, r2
- pld [r1]
- RND_AVG32 r8, r9, r6, r7, r4, r5, r12
- subs r3, r3, #1
- stmia r0, {r8-r9}
- add r0, r0, r2
- bne 6b
- ldmfd sp!, {r4-r11,pc}
- .align 8
-2:
- ldmia r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
-6: ldmia r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
- RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stmia r0, {r10-r11}
- add r0, r0, r2
- ldmia r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
- subs r3, r3, #1
- RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stmia r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- ldmfd sp!, {r4-r11,pc}
- .align 8
-3:
- ldmia r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
-6: ldmia r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
- RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stmia r0, {r10-r11}
- add r0, r0, r2
- ldmia r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
- subs r3, r3, #1
- RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stmia r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- ldmfd sp!, {r4-r11,pc}
- .align 8
-4:
- ldmia r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
-6: ldmia r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
- RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stmia r0, {r10-r11}
- add r0, r0, r2
- ldmia r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
- subs r3, r3, #1
- RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stmia r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- ldmfd sp!, {r4-r11,pc}
-
- .align 8
-5:
- .word 0xFEFEFEFE
- .word 2b
- .word 3b
- .word 4b
- .endfunc
-
- .align 8
-function put_no_rnd_pixels8_y2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- stmfd sp!, {r4-r11,lr} @ R14 is also called LR
- adr r5, 5f
- ands r4, r1, #3
- mov r3, r3, lsr #1
- ldr r12, [r5]
- add r5, r5, r4, lsl #2
- bic r1, r1, #3
- ldrne pc, [r5]
-1:
- ldmia r1, {r4-r5}
- add r1, r1, r2
-6: ldmia r1, {r6-r7}
- add r1, r1, r2
- pld [r1]
- NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
- ldmia r1, {r4-r5}
- add r1, r1, r2
- stmia r0, {r8-r9}
- add r0, r0, r2
- pld [r1]
- NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
- subs r3, r3, #1
- stmia r0, {r8-r9}
- add r0, r0, r2
- bne 6b
- ldmfd sp!, {r4-r11,pc}
- .align 8
-2:
- ldmia r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
-6: ldmia r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
- NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stmia r0, {r10-r11}
- add r0, r0, r2
- ldmia r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
- subs r3, r3, #1
- NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stmia r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- ldmfd sp!, {r4-r11,pc}
- .align 8
-3:
- ldmia r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
-6: ldmia r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
- NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stmia r0, {r10-r11}
- add r0, r0, r2
- ldmia r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
- subs r3, r3, #1
- NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stmia r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- ldmfd sp!, {r4-r11,pc}
- .align 8
-4:
- ldmia r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
-6: ldmia r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
- NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stmia r0, {r10-r11}
- add r0, r0, r2
- ldmia r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
- subs r3, r3, #1
- NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stmia r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- ldmfd sp!, {r4-r11,pc}
- .align 8
-5:
- .word 0xFEFEFEFE
- .word 2b
- .word 3b
- .word 4b
- .endfunc
-
-@ ----------------------------------------------------------------
-.macro RND_XY2_IT align
- @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
- @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
-.if \align == 0
- ldmia r1, {r6-r8}
-.elseif \align == 3
- ldmia r1, {r5-r7}
-.else
- ldmia r1, {r8-r10}
-.endif
- add r1, r1, r2
- pld [r1]
-.if \align == 0
- ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8
-.elseif \align == 1
- ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10
- ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10
-.elseif \align == 2
- ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10
- ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10
-.elseif \align == 3
- ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7
-.endif
- ldr r14, [r12, #0] @ 0x03030303
- tst r3, #1
- and r8, r4, r14
- and r9, r5, r14
- and r10, r6, r14
- and r11, r7, r14
- ldreq r14, [r12, #16] @ 0x02020202/0x01010101
- add r8, r8, r10
- add r9, r9, r11
- addeq r8, r8, r14
- addeq r9, r9, r14
- ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2
- and r4, r14, r4, lsr #2
- and r5, r14, r5, lsr #2
- and r6, r14, r6, lsr #2
- and r7, r14, r7, lsr #2
- add r10, r4, r6
- add r11, r5, r7
- subs r3, r3, #1
-.endm
-
-.macro RND_XY2_EXPAND align
- RND_XY2_IT \align
-6: stmfd sp!, {r8-r11}
- RND_XY2_IT \align
- ldmfd sp!, {r4-r7}
- add r4, r4, r8
- add r5, r5, r9
- add r6, r6, r10
- add r7, r7, r11
- ldr r14, [r12, #24] @ 0x0F0F0F0F
- and r4, r14, r4, lsr #2
- and r5, r14, r5, lsr #2
- add r4, r4, r6
- add r5, r5, r7
- stmia r0, {r4-r5}
- add r0, r0, r2
- bge 6b
- ldmfd sp!, {r4-r11,pc}
-.endm
-
- .align 8
-function put_pixels8_xy2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- stmfd sp!, {r4-r11,lr} @ R14 is also called LR
- adrl r12, 5f
- ands r4, r1, #3
- add r5, r12, r4, lsl #2
- bic r1, r1, #3
- ldrne pc, [r5]
-1:
- RND_XY2_EXPAND 0
-
- .align 8
-2:
- RND_XY2_EXPAND 1
-
- .align 8
-3:
- RND_XY2_EXPAND 2
-
- .align 8
-4:
- RND_XY2_EXPAND 3
-
-5:
- .word 0x03030303
- .word 2b
- .word 3b
- .word 4b
- .word 0x02020202
- .word 0xFCFCFCFC >> 2
- .word 0x0F0F0F0F
- .endfunc
-
- .align 8
-function put_no_rnd_pixels8_xy2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- stmfd sp!, {r4-r11,lr} @ R14 is also called LR
- adrl r12, 5f
- ands r4, r1, #3
- add r5, r12, r4, lsl #2
- bic r1, r1, #3
- ldrne pc, [r5]
-1:
- RND_XY2_EXPAND 0
-
- .align 8
-2:
- RND_XY2_EXPAND 1
-
- .align 8
-3:
- RND_XY2_EXPAND 2
-
- .align 8
-4:
- RND_XY2_EXPAND 3
-
-5:
- .word 0x03030303
- .word 2b
- .word 3b
- .word 4b
- .word 0x01010101
- .word 0xFCFCFCFC >> 2
- .word 0x0F0F0F0F
- .endfunc
-
-@ void ff_add_pixels_clamped_ARM(int16_t *block, uint8_t *dest, int stride)
-function ff_add_pixels_clamped_ARM, export=1
- push {r4-r10}
- mov r10, #8
-1:
- ldr r4, [r1] /* load dest */
- /* block[0] and block[1]*/
- ldrsh r5, [r0]
- ldrsh r7, [r0, #2]
- and r6, r4, #0xFF
- and r8, r4, #0xFF00
- add r6, r5, r6
- add r8, r7, r8, lsr #8
- mvn r5, r5
- mvn r7, r7
- tst r6, #0x100
- movne r6, r5, lsr #24
- tst r8, #0x100
- movne r8, r7, lsr #24
- mov r9, r6
- ldrsh r5, [r0, #4] /* moved form [A] */
- orr r9, r9, r8, lsl #8
- /* block[2] and block[3] */
- /* [A] */
- ldrsh r7, [r0, #6]
- and r6, r4, #0xFF0000
- and r8, r4, #0xFF000000
- add r6, r5, r6, lsr #16
- add r8, r7, r8, lsr #24
- mvn r5, r5
- mvn r7, r7
- tst r6, #0x100
- movne r6, r5, lsr #24
- tst r8, #0x100
- movne r8, r7, lsr #24
- orr r9, r9, r6, lsl #16
- ldr r4, [r1, #4] /* moved form [B] */
- orr r9, r9, r8, lsl #24
- /* store dest */
- ldrsh r5, [r0, #8] /* moved form [C] */
- str r9, [r1]
-
- /* load dest */
- /* [B] */
- /* block[4] and block[5] */
- /* [C] */
- ldrsh r7, [r0, #10]
- and r6, r4, #0xFF
- and r8, r4, #0xFF00
- add r6, r5, r6
- add r8, r7, r8, lsr #8
- mvn r5, r5
- mvn r7, r7
- tst r6, #0x100
- movne r6, r5, lsr #24
- tst r8, #0x100
- movne r8, r7, lsr #24
- mov r9, r6
- ldrsh r5, [r0, #12] /* moved from [D] */
- orr r9, r9, r8, lsl #8
- /* block[6] and block[7] */
- /* [D] */
- ldrsh r7, [r0, #14]
- and r6, r4, #0xFF0000
- and r8, r4, #0xFF000000
- add r6, r5, r6, lsr #16
- add r8, r7, r8, lsr #24
- mvn r5, r5
- mvn r7, r7
- tst r6, #0x100
- movne r6, r5, lsr #24
- tst r8, #0x100
- movne r8, r7, lsr #24
- orr r9, r9, r6, lsl #16
- add r0, r0, #16 /* moved from [E] */
- orr r9, r9, r8, lsl #24
- subs r10, r10, #1 /* moved from [F] */
- /* store dest */
- str r9, [r1, #4]
-
- /* [E] */
- /* [F] */
- add r1, r1, r2
- bne 1b
-
- pop {r4-r10}
- bx lr
- .endfunc
diff --git a/libavcodec/armv4l/dsputil_iwmmxt_rnd_template.c b/libavcodec/armv4l/dsputil_iwmmxt_rnd_template.c
deleted file mode 100644
index fddbdae..0000000
--- a/libavcodec/armv4l/dsputil_iwmmxt_rnd_template.c
+++ /dev/null
@@ -1,1118 +0,0 @@
-/*
- * iWMMXt optimized DSP utils
- * copyright (c) 2004 AGAWA Koji
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/* This header intentionally has no multiple inclusion guards. It is meant to
- * be included multiple times and generates different code depending on the
- * value of certain #defines. */
-
-void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
- int stride = line_size;
- __asm__ volatile (
- "and r12, %[pixels], #7 \n\t"
- "bic %[pixels], %[pixels], #7 \n\t"
- "tmcr wcgr1, r12 \n\t"
- "add r4, %[pixels], %[line_size] \n\t"
- "add r5, %[block], %[line_size] \n\t"
- "mov %[line_size], %[line_size], lsl #1 \n\t"
- "1: \n\t"
- "wldrd wr0, [%[pixels]] \n\t"
- "subs %[h], %[h], #2 \n\t"
- "wldrd wr1, [%[pixels], #8] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "wldrd wr3, [r4] \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "wldrd wr4, [r4, #8] \n\t"
- "add r4, r4, %[line_size] \n\t"
- "walignr1 wr8, wr0, wr1 \n\t"
- "pld [r4] \n\t"
- "pld [r4, #32] \n\t"
- "walignr1 wr10, wr3, wr4 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
- "wstrd wr10, [r5] \n\t"
- "add r5, r5, %[line_size] \n\t"
- "bne 1b \n\t"
- : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
- :
- : "memory", "r4", "r5", "r12");
-}
-
-void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
- int stride = line_size;
- __asm__ volatile (
- "and r12, %[pixels], #7 \n\t"
- "bic %[pixels], %[pixels], #7 \n\t"
- "tmcr wcgr1, r12 \n\t"
- "add r4, %[pixels], %[line_size] \n\t"
- "add r5, %[block], %[line_size] \n\t"
- "mov %[line_size], %[line_size], lsl #1 \n\t"
- "1: \n\t"
- "wldrd wr0, [%[pixels]] \n\t"
- "subs %[h], %[h], #2 \n\t"
- "wldrd wr1, [%[pixels], #8] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "wldrd wr3, [r4] \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "wldrd wr4, [r4, #8] \n\t"
- "add r4, r4, %[line_size] \n\t"
- "walignr1 wr8, wr0, wr1 \n\t"
- "wldrd wr0, [%[block]] \n\t"
- "wldrd wr2, [r5] \n\t"
- "pld [r4] \n\t"
- "pld [r4, #32] \n\t"
- "walignr1 wr10, wr3, wr4 \n\t"
- WAVG2B" wr8, wr8, wr0 \n\t"
- WAVG2B" wr10, wr10, wr2 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
- "wstrd wr10, [r5] \n\t"
- "pld [%[block]] \n\t"
- "pld [%[block], #32] \n\t"
- "add r5, r5, %[line_size] \n\t"
- "pld [r5] \n\t"
- "pld [r5, #32] \n\t"
- "bne 1b \n\t"
- : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
- :
- : "memory", "r4", "r5", "r12");
-}
-
-void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
- int stride = line_size;
- __asm__ volatile (
- "and r12, %[pixels], #7 \n\t"
- "bic %[pixels], %[pixels], #7 \n\t"
- "tmcr wcgr1, r12 \n\t"
- "add r4, %[pixels], %[line_size] \n\t"
- "add r5, %[block], %[line_size] \n\t"
- "mov %[line_size], %[line_size], lsl #1 \n\t"
- "1: \n\t"
- "wldrd wr0, [%[pixels]] \n\t"
- "wldrd wr1, [%[pixels], #8] \n\t"
- "subs %[h], %[h], #2 \n\t"
- "wldrd wr2, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "wldrd wr3, [r4] \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr8, wr0, wr1 \n\t"
- "wldrd wr4, [r4, #8] \n\t"
- "walignr1 wr9, wr1, wr2 \n\t"
- "wldrd wr5, [r4, #16] \n\t"
- "add r4, r4, %[line_size] \n\t"
- "pld [r4] \n\t"
- "pld [r4, #32] \n\t"
- "walignr1 wr10, wr3, wr4 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "walignr1 wr11, wr4, wr5 \n\t"
- "wstrd wr9, [%[block], #8] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
- "wstrd wr10, [r5] \n\t"
- "wstrd wr11, [r5, #8] \n\t"
- "add r5, r5, %[line_size] \n\t"
- "bne 1b \n\t"
- : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
- :
- : "memory", "r4", "r5", "r12");
-}
-
-void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
- int stride = line_size;
- __asm__ volatile (
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "pld [%[block]] \n\t"
- "pld [%[block], #32] \n\t"
- "and r12, %[pixels], #7 \n\t"
- "bic %[pixels], %[pixels], #7 \n\t"
- "tmcr wcgr1, r12 \n\t"
- "add r4, %[pixels], %[line_size]\n\t"
- "add r5, %[block], %[line_size] \n\t"
- "mov %[line_size], %[line_size], lsl #1 \n\t"
- "1: \n\t"
- "wldrd wr0, [%[pixels]] \n\t"
- "wldrd wr1, [%[pixels], #8] \n\t"
- "subs %[h], %[h], #2 \n\t"
- "wldrd wr2, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "wldrd wr3, [r4] \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr8, wr0, wr1 \n\t"
- "wldrd wr4, [r4, #8] \n\t"
- "walignr1 wr9, wr1, wr2 \n\t"
- "wldrd wr5, [r4, #16] \n\t"
- "add r4, r4, %[line_size] \n\t"
- "wldrd wr0, [%[block]] \n\t"
- "pld [r4] \n\t"
- "wldrd wr1, [%[block], #8] \n\t"
- "pld [r4, #32] \n\t"
- "wldrd wr2, [r5] \n\t"
- "walignr1 wr10, wr3, wr4 \n\t"
- "wldrd wr3, [r5, #8] \n\t"
- WAVG2B" wr8, wr8, wr0 \n\t"
- WAVG2B" wr9, wr9, wr1 \n\t"
- WAVG2B" wr10, wr10, wr2 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "walignr1 wr11, wr4, wr5 \n\t"
- WAVG2B" wr11, wr11, wr3 \n\t"
- "wstrd wr9, [%[block], #8] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
- "wstrd wr10, [r5] \n\t"
- "pld [%[block]] \n\t"
- "pld [%[block], #32] \n\t"
- "wstrd wr11, [r5, #8] \n\t"
- "add r5, r5, %[line_size] \n\t"
- "pld [r5] \n\t"
- "pld [r5, #32] \n\t"
- "bne 1b \n\t"
- : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
- :
- : "memory", "r4", "r5", "r12");
-}
-
-void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
- int stride = line_size;
- // [wr0 wr1 wr2 wr3] for previous line
- // [wr4 wr5 wr6 wr7] for current line
- SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
- __asm__ volatile(
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "and r12, %[pixels], #7 \n\t"
- "bic %[pixels], %[pixels], #7 \n\t"
- "tmcr wcgr1, r12 \n\t"
- "add r12, r12, #1 \n\t"
- "add r4, %[pixels], %[line_size]\n\t"
- "tmcr wcgr2, r12 \n\t"
- "add r5, %[block], %[line_size] \n\t"
- "mov %[line_size], %[line_size], lsl #1 \n\t"
-
- "1: \n\t"
- "wldrd wr10, [%[pixels]] \n\t"
- "cmp r12, #8 \n\t"
- "wldrd wr11, [%[pixels], #8] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "wldrd wr13, [r4] \n\t"
- "pld [%[pixels]] \n\t"
- "wldrd wr14, [r4, #8] \n\t"
- "pld [%[pixels], #32] \n\t"
- "add r4, r4, %[line_size] \n\t"
- "walignr1 wr0, wr10, wr11 \n\t"
- "pld [r4] \n\t"
- "pld [r4, #32] \n\t"
- "walignr1 wr2, wr13, wr14 \n\t"
- "wmoveq wr4, wr11 \n\t"
- "wmoveq wr6, wr14 \n\t"
- "walignr2ne wr4, wr10, wr11 \n\t"
- "walignr2ne wr6, wr13, wr14 \n\t"
- WAVG2B" wr0, wr0, wr4 \n\t"
- WAVG2B" wr2, wr2, wr6 \n\t"
- "wstrd wr0, [%[block]] \n\t"
- "subs %[h], %[h], #2 \n\t"
- "wstrd wr2, [r5] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
- "add r5, r5, %[line_size] \n\t"
- "bne 1b \n\t"
- : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
- :
- : "r4", "r5", "r12", "memory");
-}
-
-void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
- int stride = line_size;
- // [wr0 wr1 wr2 wr3] for previous line
- // [wr4 wr5 wr6 wr7] for current line
- SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
- __asm__ volatile(
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "and r12, %[pixels], #7 \n\t"
- "bic %[pixels], %[pixels], #7 \n\t"
- "tmcr wcgr1, r12 \n\t"
- "add r12, r12, #1 \n\t"
- "add r4, %[pixels], %[line_size]\n\t"
- "tmcr wcgr2, r12 \n\t"
- "add r5, %[block], %[line_size] \n\t"
- "mov %[line_size], %[line_size], lsl #1 \n\t"
-
- "1: \n\t"
- "wldrd wr10, [%[pixels]] \n\t"
- "cmp r12, #8 \n\t"
- "wldrd wr11, [%[pixels], #8] \n\t"
- "wldrd wr12, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "wldrd wr13, [r4] \n\t"
- "pld [%[pixels]] \n\t"
- "wldrd wr14, [r4, #8] \n\t"
- "pld [%[pixels], #32] \n\t"
- "wldrd wr15, [r4, #16] \n\t"
- "add r4, r4, %[line_size] \n\t"
- "walignr1 wr0, wr10, wr11 \n\t"
- "pld [r4] \n\t"
- "pld [r4, #32] \n\t"
- "walignr1 wr1, wr11, wr12 \n\t"
- "walignr1 wr2, wr13, wr14 \n\t"
- "walignr1 wr3, wr14, wr15 \n\t"
- "wmoveq wr4, wr11 \n\t"
- "wmoveq wr5, wr12 \n\t"
- "wmoveq wr6, wr14 \n\t"
- "wmoveq wr7, wr15 \n\t"
- "walignr2ne wr4, wr10, wr11 \n\t"
- "walignr2ne wr5, wr11, wr12 \n\t"
- "walignr2ne wr6, wr13, wr14 \n\t"
- "walignr2ne wr7, wr14, wr15 \n\t"
- WAVG2B" wr0, wr0, wr4 \n\t"
- WAVG2B" wr1, wr1, wr5 \n\t"
- "wstrd wr0, [%[block]] \n\t"
- WAVG2B" wr2, wr2, wr6 \n\t"
- "wstrd wr1, [%[block], #8] \n\t"
- WAVG2B" wr3, wr3, wr7 \n\t"
- "add %[block], %[block], %[line_size] \n\t"
- "wstrd wr2, [r5] \n\t"
- "subs %[h], %[h], #2 \n\t"
- "wstrd wr3, [r5, #8] \n\t"
- "add r5, r5, %[line_size] \n\t"
- "bne 1b \n\t"
- : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
- :
- : "r4", "r5", "r12", "memory");
-}
-
-void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
- int stride = line_size;
- // [wr0 wr1 wr2 wr3] for previous line
- // [wr4 wr5 wr6 wr7] for current line
- SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
- __asm__ volatile(
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "pld [%[block]] \n\t"
- "pld [%[block], #32] \n\t"
- "and r12, %[pixels], #7 \n\t"
- "bic %[pixels], %[pixels], #7 \n\t"
- "tmcr wcgr1, r12 \n\t"
- "add r12, r12, #1 \n\t"
- "add r4, %[pixels], %[line_size]\n\t"
- "tmcr wcgr2, r12 \n\t"
- "add r5, %[block], %[line_size] \n\t"
- "mov %[line_size], %[line_size], lsl #1 \n\t"
- "pld [r5] \n\t"
- "pld [r5, #32] \n\t"
-
- "1: \n\t"
- "wldrd wr10, [%[pixels]] \n\t"
- "cmp r12, #8 \n\t"
- "wldrd wr11, [%[pixels], #8] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "wldrd wr13, [r4] \n\t"
- "pld [%[pixels]] \n\t"
- "wldrd wr14, [r4, #8] \n\t"
- "pld [%[pixels], #32] \n\t"
- "add r4, r4, %[line_size] \n\t"
- "walignr1 wr0, wr10, wr11 \n\t"
- "pld [r4] \n\t"
- "pld [r4, #32] \n\t"
- "walignr1 wr2, wr13, wr14 \n\t"
- "wmoveq wr4, wr11 \n\t"
- "wmoveq wr6, wr14 \n\t"
- "walignr2ne wr4, wr10, wr11 \n\t"
- "wldrd wr10, [%[block]] \n\t"
- "walignr2ne wr6, wr13, wr14 \n\t"
- "wldrd wr12, [r5] \n\t"
- WAVG2B" wr0, wr0, wr4 \n\t"
- WAVG2B" wr2, wr2, wr6 \n\t"
- WAVG2B" wr0, wr0, wr10 \n\t"
- WAVG2B" wr2, wr2, wr12 \n\t"
- "wstrd wr0, [%[block]] \n\t"
- "subs %[h], %[h], #2 \n\t"
- "wstrd wr2, [r5] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
- "add r5, r5, %[line_size] \n\t"
- "pld [%[block]] \n\t"
- "pld [%[block], #32] \n\t"
- "pld [r5] \n\t"
- "pld [r5, #32] \n\t"
- "bne 1b \n\t"
- : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
- :
- : "r4", "r5", "r12", "memory");
-}
-
-void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
- int stride = line_size;
- // [wr0 wr1 wr2 wr3] for previous line
- // [wr4 wr5 wr6 wr7] for current line
- SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
- __asm__ volatile(
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "pld [%[block]] \n\t"
- "pld [%[block], #32] \n\t"
- "and r12, %[pixels], #7 \n\t"
- "bic %[pixels], %[pixels], #7 \n\t"
- "tmcr wcgr1, r12 \n\t"
- "add r12, r12, #1 \n\t"
- "add r4, %[pixels], %[line_size]\n\t"
- "tmcr wcgr2, r12 \n\t"
- "add r5, %[block], %[line_size] \n\t"
- "mov %[line_size], %[line_size], lsl #1 \n\t"
- "pld [r5] \n\t"
- "pld [r5, #32] \n\t"
-
- "1: \n\t"
- "wldrd wr10, [%[pixels]] \n\t"
- "cmp r12, #8 \n\t"
- "wldrd wr11, [%[pixels], #8] \n\t"
- "wldrd wr12, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "wldrd wr13, [r4] \n\t"
- "pld [%[pixels]] \n\t"
- "wldrd wr14, [r4, #8] \n\t"
- "pld [%[pixels], #32] \n\t"
- "wldrd wr15, [r4, #16] \n\t"
- "add r4, r4, %[line_size] \n\t"
- "walignr1 wr0, wr10, wr11 \n\t"
- "pld [r4] \n\t"
- "pld [r4, #32] \n\t"
- "walignr1 wr1, wr11, wr12 \n\t"
- "walignr1 wr2, wr13, wr14 \n\t"
- "walignr1 wr3, wr14, wr15 \n\t"
- "wmoveq wr4, wr11 \n\t"
- "wmoveq wr5, wr12 \n\t"
- "wmoveq wr6, wr14 \n\t"
- "wmoveq wr7, wr15 \n\t"
- "walignr2ne wr4, wr10, wr11 \n\t"
- "walignr2ne wr5, wr11, wr12 \n\t"
- "walignr2ne wr6, wr13, wr14 \n\t"
- "walignr2ne wr7, wr14, wr15 \n\t"
- "wldrd wr10, [%[block]] \n\t"
- WAVG2B" wr0, wr0, wr4 \n\t"
- "wldrd wr11, [%[block], #8] \n\t"
- WAVG2B" wr1, wr1, wr5 \n\t"
- "wldrd wr12, [r5] \n\t"
- WAVG2B" wr2, wr2, wr6 \n\t"
- "wldrd wr13, [r5, #8] \n\t"
- WAVG2B" wr3, wr3, wr7 \n\t"
- WAVG2B" wr0, wr0, wr10 \n\t"
- WAVG2B" wr1, wr1, wr11 \n\t"
- WAVG2B" wr2, wr2, wr12 \n\t"
- WAVG2B" wr3, wr3, wr13 \n\t"
- "wstrd wr0, [%[block]] \n\t"
- "subs %[h], %[h], #2 \n\t"
- "wstrd wr1, [%[block], #8] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
- "wstrd wr2, [r5] \n\t"
- "pld [%[block]] \n\t"
- "wstrd wr3, [r5, #8] \n\t"
- "add r5, r5, %[line_size] \n\t"
- "pld [%[block], #32] \n\t"
- "pld [r5] \n\t"
- "pld [r5, #32] \n\t"
- "bne 1b \n\t"
- : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
- :
- :"r4", "r5", "r12", "memory");
-}
-
-void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
- int stride = line_size;
- // [wr0 wr1 wr2 wr3] for previous line
- // [wr4 wr5 wr6 wr7] for current line
- __asm__ volatile(
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "and r12, %[pixels], #7 \n\t"
- "tmcr wcgr1, r12 \n\t"
- "bic %[pixels], %[pixels], #7 \n\t"
-
- "wldrd wr10, [%[pixels]] \n\t"
- "wldrd wr11, [%[pixels], #8] \n\t"
- "pld [%[block]] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "walignr1 wr0, wr10, wr11 \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
-
- "1: \n\t"
- "wldrd wr10, [%[pixels]] \n\t"
- "wldrd wr11, [%[pixels], #8] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr4, wr10, wr11 \n\t"
- "wldrd wr10, [%[block]] \n\t"
- WAVG2B" wr8, wr0, wr4 \n\t"
- WAVG2B" wr8, wr8, wr10 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
-
- "wldrd wr10, [%[pixels]] \n\t"
- "wldrd wr11, [%[pixels], #8] \n\t"
- "pld [%[block]] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr0, wr10, wr11 \n\t"
- "wldrd wr10, [%[block]] \n\t"
- WAVG2B" wr8, wr0, wr4 \n\t"
- WAVG2B" wr8, wr8, wr10 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
-
- "subs %[h], %[h], #2 \n\t"
- "pld [%[block]] \n\t"
- "bne 1b \n\t"
- : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
- :
- : "cc", "memory", "r12");
-}
-
-void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
- int stride = line_size;
- // [wr0 wr1 wr2 wr3] for previous line
- // [wr4 wr5 wr6 wr7] for current line
- __asm__ volatile(
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "and r12, %[pixels], #7 \n\t"
- "tmcr wcgr1, r12 \n\t"
- "bic %[pixels], %[pixels], #7 \n\t"
-
- "wldrd wr10, [%[pixels]] \n\t"
- "wldrd wr11, [%[pixels], #8] \n\t"
- "wldrd wr12, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr0, wr10, wr11 \n\t"
- "walignr1 wr1, wr11, wr12 \n\t"
-
- "1: \n\t"
- "wldrd wr10, [%[pixels]] \n\t"
- "wldrd wr11, [%[pixels], #8] \n\t"
- "wldrd wr12, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr4, wr10, wr11 \n\t"
- "walignr1 wr5, wr11, wr12 \n\t"
- WAVG2B" wr8, wr0, wr4 \n\t"
- WAVG2B" wr9, wr1, wr5 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "wstrd wr9, [%[block], #8] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
-
- "wldrd wr10, [%[pixels]] \n\t"
- "wldrd wr11, [%[pixels], #8] \n\t"
- "wldrd wr12, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr0, wr10, wr11 \n\t"
- "walignr1 wr1, wr11, wr12 \n\t"
- WAVG2B" wr8, wr0, wr4 \n\t"
- WAVG2B" wr9, wr1, wr5 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "wstrd wr9, [%[block], #8] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
-
- "subs %[h], %[h], #2 \n\t"
- "bne 1b \n\t"
- : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
- :
- : "r4", "r5", "r12", "memory");
-}
-
-void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
- int stride = line_size;
- // [wr0 wr1 wr2 wr3] for previous line
- // [wr4 wr5 wr6 wr7] for current line
- __asm__ volatile(
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "and r12, %[pixels], #7 \n\t"
- "tmcr wcgr1, r12 \n\t"
- "bic %[pixels], %[pixels], #7 \n\t"
-
- "wldrd wr10, [%[pixels]] \n\t"
- "wldrd wr11, [%[pixels], #8] \n\t"
- "pld [%[block]] \n\t"
- "wldrd wr12, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr0, wr10, wr11 \n\t"
- "walignr1 wr1, wr11, wr12 \n\t"
-
- "1: \n\t"
- "wldrd wr10, [%[pixels]] \n\t"
- "wldrd wr11, [%[pixels], #8] \n\t"
- "wldrd wr12, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr4, wr10, wr11 \n\t"
- "walignr1 wr5, wr11, wr12 \n\t"
- "wldrd wr10, [%[block]] \n\t"
- "wldrd wr11, [%[block], #8] \n\t"
- WAVG2B" wr8, wr0, wr4 \n\t"
- WAVG2B" wr9, wr1, wr5 \n\t"
- WAVG2B" wr8, wr8, wr10 \n\t"
- WAVG2B" wr9, wr9, wr11 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "wstrd wr9, [%[block], #8] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
-
- "wldrd wr10, [%[pixels]] \n\t"
- "wldrd wr11, [%[pixels], #8] \n\t"
- "pld [%[block]] \n\t"
- "wldrd wr12, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr0, wr10, wr11 \n\t"
- "walignr1 wr1, wr11, wr12 \n\t"
- "wldrd wr10, [%[block]] \n\t"
- "wldrd wr11, [%[block], #8] \n\t"
- WAVG2B" wr8, wr0, wr4 \n\t"
- WAVG2B" wr9, wr1, wr5 \n\t"
- WAVG2B" wr8, wr8, wr10 \n\t"
- WAVG2B" wr9, wr9, wr11 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "wstrd wr9, [%[block], #8] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
-
- "subs %[h], %[h], #2 \n\t"
- "pld [%[block]] \n\t"
- "bne 1b \n\t"
- : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
- :
- : "r4", "r5", "r12", "memory");
-}
-
-void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
- // [wr0 wr1 wr2 wr3] for previous line
- // [wr4 wr5 wr6 wr7] for current line
- SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
- __asm__ volatile(
- "pld [%[pixels]] \n\t"
- "mov r12, #2 \n\t"
- "pld [%[pixels], #32] \n\t"
- "tmcr wcgr0, r12 \n\t" /* for shift value */
- "and r12, %[pixels], #7 \n\t"
- "bic %[pixels], %[pixels], #7 \n\t"
- "tmcr wcgr1, r12 \n\t"
-
- // [wr0 wr1 wr2 wr3] <= *
- // [wr4 wr5 wr6 wr7]
- "wldrd wr12, [%[pixels]] \n\t"
- "add r12, r12, #1 \n\t"
- "wldrd wr13, [%[pixels], #8] \n\t"
- "tmcr wcgr2, r12 \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "cmp r12, #8 \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr2, wr12, wr13 \n\t"
- "wmoveq wr10, wr13 \n\t"
- "walignr2ne wr10, wr12, wr13 \n\t"
- "wunpckelub wr0, wr2 \n\t"
- "wunpckehub wr1, wr2 \n\t"
- "wunpckelub wr8, wr10 \n\t"
- "wunpckehub wr9, wr10 \n\t"
- "waddhus wr0, wr0, wr8 \n\t"
- "waddhus wr1, wr1, wr9 \n\t"
-
- "1: \n\t"
- // [wr0 wr1 wr2 wr3]
- // [wr4 wr5 wr6 wr7] <= *
- "wldrd wr12, [%[pixels]] \n\t"
- "cmp r12, #8 \n\t"
- "wldrd wr13, [%[pixels], #8] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "walignr1 wr6, wr12, wr13 \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "wmoveq wr10, wr13 \n\t"
- "walignr2ne wr10, wr12, wr13 \n\t"
- "wunpckelub wr4, wr6 \n\t"
- "wunpckehub wr5, wr6 \n\t"
- "wunpckelub wr8, wr10 \n\t"
- "wunpckehub wr9, wr10 \n\t"
- "waddhus wr4, wr4, wr8 \n\t"
- "waddhus wr5, wr5, wr9 \n\t"
- "waddhus wr8, wr0, wr4 \n\t"
- "waddhus wr9, wr1, wr5 \n\t"
- "waddhus wr8, wr8, wr15 \n\t"
- "waddhus wr9, wr9, wr15 \n\t"
- "wsrlhg wr8, wr8, wcgr0 \n\t"
- "wsrlhg wr9, wr9, wcgr0 \n\t"
- "wpackhus wr8, wr8, wr9 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
-
- // [wr0 wr1 wr2 wr3] <= *
- // [wr4 wr5 wr6 wr7]
- "wldrd wr12, [%[pixels]] \n\t"
- "wldrd wr13, [%[pixels], #8] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "walignr1 wr2, wr12, wr13 \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "wmoveq wr10, wr13 \n\t"
- "walignr2ne wr10, wr12, wr13 \n\t"
- "wunpckelub wr0, wr2 \n\t"
- "wunpckehub wr1, wr2 \n\t"
- "wunpckelub wr8, wr10 \n\t"
- "wunpckehub wr9, wr10 \n\t"
- "waddhus wr0, wr0, wr8 \n\t"
- "waddhus wr1, wr1, wr9 \n\t"
- "waddhus wr8, wr0, wr4 \n\t"
- "waddhus wr9, wr1, wr5 \n\t"
- "waddhus wr8, wr8, wr15 \n\t"
- "waddhus wr9, wr9, wr15 \n\t"
- "wsrlhg wr8, wr8, wcgr0 \n\t"
- "wsrlhg wr9, wr9, wcgr0 \n\t"
- "wpackhus wr8, wr8, wr9 \n\t"
- "subs %[h], %[h], #2 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
- "bne 1b \n\t"
- : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
- : [line_size]"r"(line_size)
- : "r12", "memory");
-}
-
-void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
- // [wr0 wr1 wr2 wr3] for previous line
- // [wr4 wr5 wr6 wr7] for current line
- SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
- __asm__ volatile(
- "pld [%[pixels]] \n\t"
- "mov r12, #2 \n\t"
- "pld [%[pixels], #32] \n\t"
- "tmcr wcgr0, r12 \n\t" /* for shift value */
- /* alignment */
- "and r12, %[pixels], #7 \n\t"
- "bic %[pixels], %[pixels], #7 \n\t"
- "tmcr wcgr1, r12 \n\t"
- "add r12, r12, #1 \n\t"
- "tmcr wcgr2, r12 \n\t"
-
- // [wr0 wr1 wr2 wr3] <= *
- // [wr4 wr5 wr6 wr7]
- "wldrd wr12, [%[pixels]] \n\t"
- "cmp r12, #8 \n\t"
- "wldrd wr13, [%[pixels], #8] \n\t"
- "wldrd wr14, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "pld [%[pixels]] \n\t"
- "walignr1 wr2, wr12, wr13 \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr3, wr13, wr14 \n\t"
- "wmoveq wr10, wr13 \n\t"
- "wmoveq wr11, wr14 \n\t"
- "walignr2ne wr10, wr12, wr13 \n\t"
- "walignr2ne wr11, wr13, wr14 \n\t"
- "wunpckelub wr0, wr2 \n\t"
- "wunpckehub wr1, wr2 \n\t"
- "wunpckelub wr2, wr3 \n\t"
- "wunpckehub wr3, wr3 \n\t"
- "wunpckelub wr8, wr10 \n\t"
- "wunpckehub wr9, wr10 \n\t"
- "wunpckelub wr10, wr11 \n\t"
- "wunpckehub wr11, wr11 \n\t"
- "waddhus wr0, wr0, wr8 \n\t"
- "waddhus wr1, wr1, wr9 \n\t"
- "waddhus wr2, wr2, wr10 \n\t"
- "waddhus wr3, wr3, wr11 \n\t"
-
- "1: \n\t"
- // [wr0 wr1 wr2 wr3]
- // [wr4 wr5 wr6 wr7] <= *
- "wldrd wr12, [%[pixels]] \n\t"
- "cmp r12, #8 \n\t"
- "wldrd wr13, [%[pixels], #8] \n\t"
- "wldrd wr14, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "walignr1 wr6, wr12, wr13 \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr7, wr13, wr14 \n\t"
- "wmoveq wr10, wr13 \n\t"
- "wmoveq wr11, wr14 \n\t"
- "walignr2ne wr10, wr12, wr13 \n\t"
- "walignr2ne wr11, wr13, wr14 \n\t"
- "wunpckelub wr4, wr6 \n\t"
- "wunpckehub wr5, wr6 \n\t"
- "wunpckelub wr6, wr7 \n\t"
- "wunpckehub wr7, wr7 \n\t"
- "wunpckelub wr8, wr10 \n\t"
- "wunpckehub wr9, wr10 \n\t"
- "wunpckelub wr10, wr11 \n\t"
- "wunpckehub wr11, wr11 \n\t"
- "waddhus wr4, wr4, wr8 \n\t"
- "waddhus wr5, wr5, wr9 \n\t"
- "waddhus wr6, wr6, wr10 \n\t"
- "waddhus wr7, wr7, wr11 \n\t"
- "waddhus wr8, wr0, wr4 \n\t"
- "waddhus wr9, wr1, wr5 \n\t"
- "waddhus wr10, wr2, wr6 \n\t"
- "waddhus wr11, wr3, wr7 \n\t"
- "waddhus wr8, wr8, wr15 \n\t"
- "waddhus wr9, wr9, wr15 \n\t"
- "waddhus wr10, wr10, wr15 \n\t"
- "waddhus wr11, wr11, wr15 \n\t"
- "wsrlhg wr8, wr8, wcgr0 \n\t"
- "wsrlhg wr9, wr9, wcgr0 \n\t"
- "wsrlhg wr10, wr10, wcgr0 \n\t"
- "wsrlhg wr11, wr11, wcgr0 \n\t"
- "wpackhus wr8, wr8, wr9 \n\t"
- "wpackhus wr9, wr10, wr11 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "wstrd wr9, [%[block], #8] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
-
- // [wr0 wr1 wr2 wr3] <= *
- // [wr4 wr5 wr6 wr7]
- "wldrd wr12, [%[pixels]] \n\t"
- "wldrd wr13, [%[pixels], #8] \n\t"
- "wldrd wr14, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "walignr1 wr2, wr12, wr13 \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr3, wr13, wr14 \n\t"
- "wmoveq wr10, wr13 \n\t"
- "wmoveq wr11, wr14 \n\t"
- "walignr2ne wr10, wr12, wr13 \n\t"
- "walignr2ne wr11, wr13, wr14 \n\t"
- "wunpckelub wr0, wr2 \n\t"
- "wunpckehub wr1, wr2 \n\t"
- "wunpckelub wr2, wr3 \n\t"
- "wunpckehub wr3, wr3 \n\t"
- "wunpckelub wr8, wr10 \n\t"
- "wunpckehub wr9, wr10 \n\t"
- "wunpckelub wr10, wr11 \n\t"
- "wunpckehub wr11, wr11 \n\t"
- "waddhus wr0, wr0, wr8 \n\t"
- "waddhus wr1, wr1, wr9 \n\t"
- "waddhus wr2, wr2, wr10 \n\t"
- "waddhus wr3, wr3, wr11 \n\t"
- "waddhus wr8, wr0, wr4 \n\t"
- "waddhus wr9, wr1, wr5 \n\t"
- "waddhus wr10, wr2, wr6 \n\t"
- "waddhus wr11, wr3, wr7 \n\t"
- "waddhus wr8, wr8, wr15 \n\t"
- "waddhus wr9, wr9, wr15 \n\t"
- "waddhus wr10, wr10, wr15 \n\t"
- "waddhus wr11, wr11, wr15 \n\t"
- "wsrlhg wr8, wr8, wcgr0 \n\t"
- "wsrlhg wr9, wr9, wcgr0 \n\t"
- "wsrlhg wr10, wr10, wcgr0 \n\t"
- "wsrlhg wr11, wr11, wcgr0 \n\t"
- "wpackhus wr8, wr8, wr9 \n\t"
- "wpackhus wr9, wr10, wr11 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "wstrd wr9, [%[block], #8] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
-
- "subs %[h], %[h], #2 \n\t"
- "bne 1b \n\t"
- : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
- : [line_size]"r"(line_size)
- : "r12", "memory");
-}
-
-void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
- // [wr0 wr1 wr2 wr3] for previous line
- // [wr4 wr5 wr6 wr7] for current line
- SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
- __asm__ volatile(
- "pld [%[block]] \n\t"
- "pld [%[block], #32] \n\t"
- "pld [%[pixels]] \n\t"
- "mov r12, #2 \n\t"
- "pld [%[pixels], #32] \n\t"
- "tmcr wcgr0, r12 \n\t" /* for shift value */
- "and r12, %[pixels], #7 \n\t"
- "bic %[pixels], %[pixels], #7 \n\t"
- "tmcr wcgr1, r12 \n\t"
-
- // [wr0 wr1 wr2 wr3] <= *
- // [wr4 wr5 wr6 wr7]
- "wldrd wr12, [%[pixels]] \n\t"
- "add r12, r12, #1 \n\t"
- "wldrd wr13, [%[pixels], #8] \n\t"
- "tmcr wcgr2, r12 \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "cmp r12, #8 \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr2, wr12, wr13 \n\t"
- "wmoveq wr10, wr13 \n\t"
- "walignr2ne wr10, wr12, wr13 \n\t"
- "wunpckelub wr0, wr2 \n\t"
- "wunpckehub wr1, wr2 \n\t"
- "wunpckelub wr8, wr10 \n\t"
- "wunpckehub wr9, wr10 \n\t"
- "waddhus wr0, wr0, wr8 \n\t"
- "waddhus wr1, wr1, wr9 \n\t"
-
- "1: \n\t"
- // [wr0 wr1 wr2 wr3]
- // [wr4 wr5 wr6 wr7] <= *
- "wldrd wr12, [%[pixels]] \n\t"
- "cmp r12, #8 \n\t"
- "wldrd wr13, [%[pixels], #8] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "walignr1 wr6, wr12, wr13 \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "wmoveq wr10, wr13 \n\t"
- "walignr2ne wr10, wr12, wr13 \n\t"
- "wunpckelub wr4, wr6 \n\t"
- "wunpckehub wr5, wr6 \n\t"
- "wunpckelub wr8, wr10 \n\t"
- "wunpckehub wr9, wr10 \n\t"
- "waddhus wr4, wr4, wr8 \n\t"
- "waddhus wr5, wr5, wr9 \n\t"
- "waddhus wr8, wr0, wr4 \n\t"
- "waddhus wr9, wr1, wr5 \n\t"
- "waddhus wr8, wr8, wr15 \n\t"
- "waddhus wr9, wr9, wr15 \n\t"
- "wldrd wr12, [%[block]] \n\t"
- "wsrlhg wr8, wr8, wcgr0 \n\t"
- "wsrlhg wr9, wr9, wcgr0 \n\t"
- "wpackhus wr8, wr8, wr9 \n\t"
- WAVG2B" wr8, wr8, wr12 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
- "wldrd wr12, [%[pixels]] \n\t"
- "pld [%[block]] \n\t"
- "pld [%[block], #32] \n\t"
-
- // [wr0 wr1 wr2 wr3] <= *
- // [wr4 wr5 wr6 wr7]
- "wldrd wr13, [%[pixels], #8] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "walignr1 wr2, wr12, wr13 \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "wmoveq wr10, wr13 \n\t"
- "walignr2ne wr10, wr12, wr13 \n\t"
- "wunpckelub wr0, wr2 \n\t"
- "wunpckehub wr1, wr2 \n\t"
- "wunpckelub wr8, wr10 \n\t"
- "wunpckehub wr9, wr10 \n\t"
- "waddhus wr0, wr0, wr8 \n\t"
- "waddhus wr1, wr1, wr9 \n\t"
- "waddhus wr8, wr0, wr4 \n\t"
- "waddhus wr9, wr1, wr5 \n\t"
- "waddhus wr8, wr8, wr15 \n\t"
- "waddhus wr9, wr9, wr15 \n\t"
- "wldrd wr12, [%[block]] \n\t"
- "wsrlhg wr8, wr8, wcgr0 \n\t"
- "wsrlhg wr9, wr9, wcgr0 \n\t"
- "wpackhus wr8, wr8, wr9 \n\t"
- "subs %[h], %[h], #2 \n\t"
- WAVG2B" wr8, wr8, wr12 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
- "pld [%[block]] \n\t"
- "pld [%[block], #32] \n\t"
- "bne 1b \n\t"
- : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
- : [line_size]"r"(line_size)
- : "r12", "memory");
-}
-
-void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
-{
- // [wr0 wr1 wr2 wr3] for previous line
- // [wr4 wr5 wr6 wr7] for current line
- SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
- __asm__ volatile(
- "pld [%[block]] \n\t"
- "pld [%[block], #32] \n\t"
- "pld [%[pixels]] \n\t"
- "mov r12, #2 \n\t"
- "pld [%[pixels], #32] \n\t"
- "tmcr wcgr0, r12 \n\t" /* for shift value */
- /* alignment */
- "and r12, %[pixels], #7 \n\t"
- "bic %[pixels], %[pixels], #7 \n\t"
- "tmcr wcgr1, r12 \n\t"
- "add r12, r12, #1 \n\t"
- "tmcr wcgr2, r12 \n\t"
-
- // [wr0 wr1 wr2 wr3] <= *
- // [wr4 wr5 wr6 wr7]
- "wldrd wr12, [%[pixels]] \n\t"
- "cmp r12, #8 \n\t"
- "wldrd wr13, [%[pixels], #8] \n\t"
- "wldrd wr14, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "pld [%[pixels]] \n\t"
- "walignr1 wr2, wr12, wr13 \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr3, wr13, wr14 \n\t"
- "wmoveq wr10, wr13 \n\t"
- "wmoveq wr11, wr14 \n\t"
- "walignr2ne wr10, wr12, wr13 \n\t"
- "walignr2ne wr11, wr13, wr14 \n\t"
- "wunpckelub wr0, wr2 \n\t"
- "wunpckehub wr1, wr2 \n\t"
- "wunpckelub wr2, wr3 \n\t"
- "wunpckehub wr3, wr3 \n\t"
- "wunpckelub wr8, wr10 \n\t"
- "wunpckehub wr9, wr10 \n\t"
- "wunpckelub wr10, wr11 \n\t"
- "wunpckehub wr11, wr11 \n\t"
- "waddhus wr0, wr0, wr8 \n\t"
- "waddhus wr1, wr1, wr9 \n\t"
- "waddhus wr2, wr2, wr10 \n\t"
- "waddhus wr3, wr3, wr11 \n\t"
-
- "1: \n\t"
- // [wr0 wr1 wr2 wr3]
- // [wr4 wr5 wr6 wr7] <= *
- "wldrd wr12, [%[pixels]] \n\t"
- "cmp r12, #8 \n\t"
- "wldrd wr13, [%[pixels], #8] \n\t"
- "wldrd wr14, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "walignr1 wr6, wr12, wr13 \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr7, wr13, wr14 \n\t"
- "wmoveq wr10, wr13 \n\t"
- "wmoveq wr11, wr14 \n\t"
- "walignr2ne wr10, wr12, wr13 \n\t"
- "walignr2ne wr11, wr13, wr14 \n\t"
- "wunpckelub wr4, wr6 \n\t"
- "wunpckehub wr5, wr6 \n\t"
- "wunpckelub wr6, wr7 \n\t"
- "wunpckehub wr7, wr7 \n\t"
- "wunpckelub wr8, wr10 \n\t"
- "wunpckehub wr9, wr10 \n\t"
- "wunpckelub wr10, wr11 \n\t"
- "wunpckehub wr11, wr11 \n\t"
- "waddhus wr4, wr4, wr8 \n\t"
- "waddhus wr5, wr5, wr9 \n\t"
- "waddhus wr6, wr6, wr10 \n\t"
- "waddhus wr7, wr7, wr11 \n\t"
- "waddhus wr8, wr0, wr4 \n\t"
- "waddhus wr9, wr1, wr5 \n\t"
- "waddhus wr10, wr2, wr6 \n\t"
- "waddhus wr11, wr3, wr7 \n\t"
- "waddhus wr8, wr8, wr15 \n\t"
- "waddhus wr9, wr9, wr15 \n\t"
- "waddhus wr10, wr10, wr15 \n\t"
- "waddhus wr11, wr11, wr15 \n\t"
- "wsrlhg wr8, wr8, wcgr0 \n\t"
- "wsrlhg wr9, wr9, wcgr0 \n\t"
- "wldrd wr12, [%[block]] \n\t"
- "wldrd wr13, [%[block], #8] \n\t"
- "wsrlhg wr10, wr10, wcgr0 \n\t"
- "wsrlhg wr11, wr11, wcgr0 \n\t"
- "wpackhus wr8, wr8, wr9 \n\t"
- "wpackhus wr9, wr10, wr11 \n\t"
- WAVG2B" wr8, wr8, wr12 \n\t"
- WAVG2B" wr9, wr9, wr13 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "wstrd wr9, [%[block], #8] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
-
- // [wr0 wr1 wr2 wr3] <= *
- // [wr4 wr5 wr6 wr7]
- "wldrd wr12, [%[pixels]] \n\t"
- "pld [%[block]] \n\t"
- "wldrd wr13, [%[pixels], #8] \n\t"
- "pld [%[block], #32] \n\t"
- "wldrd wr14, [%[pixels], #16] \n\t"
- "add %[pixels], %[pixels], %[line_size] \n\t"
- "walignr1 wr2, wr12, wr13 \n\t"
- "pld [%[pixels]] \n\t"
- "pld [%[pixels], #32] \n\t"
- "walignr1 wr3, wr13, wr14 \n\t"
- "wmoveq wr10, wr13 \n\t"
- "wmoveq wr11, wr14 \n\t"
- "walignr2ne wr10, wr12, wr13 \n\t"
- "walignr2ne wr11, wr13, wr14 \n\t"
- "wunpckelub wr0, wr2 \n\t"
- "wunpckehub wr1, wr2 \n\t"
- "wunpckelub wr2, wr3 \n\t"
- "wunpckehub wr3, wr3 \n\t"
- "wunpckelub wr8, wr10 \n\t"
- "wunpckehub wr9, wr10 \n\t"
- "wunpckelub wr10, wr11 \n\t"
- "wunpckehub wr11, wr11 \n\t"
- "waddhus wr0, wr0, wr8 \n\t"
- "waddhus wr1, wr1, wr9 \n\t"
- "waddhus wr2, wr2, wr10 \n\t"
- "waddhus wr3, wr3, wr11 \n\t"
- "waddhus wr8, wr0, wr4 \n\t"
- "waddhus wr9, wr1, wr5 \n\t"
- "waddhus wr10, wr2, wr6 \n\t"
- "waddhus wr11, wr3, wr7 \n\t"
- "waddhus wr8, wr8, wr15 \n\t"
- "waddhus wr9, wr9, wr15 \n\t"
- "waddhus wr10, wr10, wr15 \n\t"
- "waddhus wr11, wr11, wr15 \n\t"
- "wsrlhg wr8, wr8, wcgr0 \n\t"
- "wsrlhg wr9, wr9, wcgr0 \n\t"
- "wldrd wr12, [%[block]] \n\t"
- "wldrd wr13, [%[block], #8] \n\t"
- "wsrlhg wr10, wr10, wcgr0 \n\t"
- "wsrlhg wr11, wr11, wcgr0 \n\t"
- "wpackhus wr8, wr8, wr9 \n\t"
- "wpackhus wr9, wr10, wr11 \n\t"
- WAVG2B" wr8, wr8, wr12 \n\t"
- WAVG2B" wr9, wr9, wr13 \n\t"
- "wstrd wr8, [%[block]] \n\t"
- "wstrd wr9, [%[block], #8] \n\t"
- "add %[block], %[block], %[line_size] \n\t"
- "subs %[h], %[h], #2 \n\t"
- "pld [%[block]] \n\t"
- "pld [%[block], #32] \n\t"
- "bne 1b \n\t"
- : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
- : [line_size]"r"(line_size)
- : "r12", "memory");
-}
diff --git a/libavcodec/armv4l/dsputil_vfp.S b/libavcodec/armv4l/dsputil_vfp.S
deleted file mode 100644
index 291f2b5..0000000
--- a/libavcodec/armv4l/dsputil_vfp.S
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright (c) 2008 Siarhei Siamashka <ssvb at users.sourceforge.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "asm.S"
-
-/*
- * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle
- * throughput for almost all the instructions (except for double precision
- * arithmetics), but rather high latency. Latency is 4 cycles for loads and 8 cycles
- * for arithmetic operations. Scheduling code to avoid pipeline stalls is very
- * important for performance. One more interesting feature is that VFP has
- * independent load/store and arithmetics pipelines, so it is possible to make
- * them work simultaneously and get more than 1 operation per cycle. Load/store
- * pipeline can process 2 single precision floating point values per cycle and
- * supports bulk loads and stores for large sets of registers. Arithmetic operations
- * can be done on vectors, which allows to keep the arithmetics pipeline busy,
- * while the processor may issue and execute other instructions. Detailed
- * optimization manuals can be found at http://www.arm.com
- */
-
-/**
- * ARM VFP optimized implementation of 'vector_fmul_c' function.
- * Assume that len is a positive number and is multiple of 8
- */
-@ void ff_vector_fmul_vfp(float *dst, const float *src, int len)
-function ff_vector_fmul_vfp, export=1
- vpush {d8-d15}
- mov r3, r0
- fmrx r12, fpscr
- orr r12, r12, #(3 << 16) /* set vector size to 4 */
- fmxr fpscr, r12
-
- fldmias r3!, {s0-s3}
- fldmias r1!, {s8-s11}
- fldmias r3!, {s4-s7}
- fldmias r1!, {s12-s15}
- fmuls s8, s0, s8
-1:
- subs r2, r2, #16
- fmuls s12, s4, s12
- fldmiasge r3!, {s16-s19}
- fldmiasge r1!, {s24-s27}
- fldmiasge r3!, {s20-s23}
- fldmiasge r1!, {s28-s31}
- fmulsge s24, s16, s24
- fstmias r0!, {s8-s11}
- fstmias r0!, {s12-s15}
- fmulsge s28, s20, s28
- fldmiasgt r3!, {s0-s3}
- fldmiasgt r1!, {s8-s11}
- fldmiasgt r3!, {s4-s7}
- fldmiasgt r1!, {s12-s15}
- fmulsge s8, s0, s8
- fstmiasge r0!, {s24-s27}
- fstmiasge r0!, {s28-s31}
- bgt 1b
-
- bic r12, r12, #(7 << 16) /* set vector size back to 1 */
- fmxr fpscr, r12
- vpop {d8-d15}
- bx lr
- .endfunc
-
-/**
- * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
- * Assume that len is a positive number and is multiple of 8
- */
-@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
-@ const float *src1, int len)
-function ff_vector_fmul_reverse_vfp, export=1
- vpush {d8-d15}
- add r2, r2, r3, lsl #2
- fldmdbs r2!, {s0-s3}
- fldmias r1!, {s8-s11}
- fldmdbs r2!, {s4-s7}
- fldmias r1!, {s12-s15}
- fmuls s8, s3, s8
- fmuls s9, s2, s9
- fmuls s10, s1, s10
- fmuls s11, s0, s11
-1:
- subs r3, r3, #16
- fldmdbsge r2!, {s16-s19}
- fmuls s12, s7, s12
- fldmiasge r1!, {s24-s27}
- fmuls s13, s6, s13
- fldmdbsge r2!, {s20-s23}
- fmuls s14, s5, s14
- fldmiasge r1!, {s28-s31}
- fmuls s15, s4, s15
- fmulsge s24, s19, s24
- fldmdbsgt r2!, {s0-s3}
- fmulsge s25, s18, s25
- fstmias r0!, {s8-s13}
- fmulsge s26, s17, s26
- fldmiasgt r1!, {s8-s11}
- fmulsge s27, s16, s27
- fmulsge s28, s23, s28
- fldmdbsgt r2!, {s4-s7}
- fmulsge s29, s22, s29
- fstmias r0!, {s14-s15}
- fmulsge s30, s21, s30
- fmulsge s31, s20, s31
- fmulsge s8, s3, s8
- fldmiasgt r1!, {s12-s15}
- fmulsge s9, s2, s9
- fmulsge s10, s1, s10
- fstmiasge r0!, {s24-s27}
- fmulsge s11, s0, s11
- fstmiasge r0!, {s28-s31}
- bgt 1b
-
- vpop {d8-d15}
- bx lr
- .endfunc
-
-#ifdef HAVE_ARMV6
-/**
- * ARM VFP optimized float to int16 conversion.
- * Assume that len is a positive number and is multiple of 8, destination
- * buffer is at least 4 bytes aligned (8 bytes alignment is better for
- * performance), little endian byte sex
- */
-@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
-function ff_float_to_int16_vfp, export=1
- push {r4-r8,lr}
- vpush {d8-d11}
- fldmias r1!, {s16-s23}
- ftosis s0, s16
- ftosis s1, s17
- ftosis s2, s18
- ftosis s3, s19
- ftosis s4, s20
- ftosis s5, s21
- ftosis s6, s22
- ftosis s7, s23
-1:
- subs r2, r2, #8
- fmrrs r3, r4, {s0, s1}
- fmrrs r5, r6, {s2, s3}
- fmrrs r7, r8, {s4, s5}
- fmrrs ip, lr, {s6, s7}
- fldmiasgt r1!, {s16-s23}
- ssat r4, #16, r4
- ssat r3, #16, r3
- ssat r6, #16, r6
- ssat r5, #16, r5
- pkhbt r3, r3, r4, lsl #16
- pkhbt r4, r5, r6, lsl #16
- ftosisgt s0, s16
- ftosisgt s1, s17
- ftosisgt s2, s18
- ftosisgt s3, s19
- ftosisgt s4, s20
- ftosisgt s5, s21
- ftosisgt s6, s22
- ftosisgt s7, s23
- ssat r8, #16, r8
- ssat r7, #16, r7
- ssat lr, #16, lr
- ssat ip, #16, ip
- pkhbt r5, r7, r8, lsl #16
- pkhbt r6, ip, lr, lsl #16
- stmia r0!, {r3-r6}
- bgt 1b
-
- vpop {d8-d11}
- pop {r4-r8,pc}
- .endfunc
-#endif
diff --git a/libavcodec/armv4l/float_arm_vfp.c b/libavcodec/armv4l/float_arm_vfp.c
deleted file mode 100644
index fb827b3..0000000
--- a/libavcodec/armv4l/float_arm_vfp.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2008 Siarhei Siamashka <ssvb at users.sourceforge.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/dsputil.h"
-
-extern void ff_vector_fmul_vfp(float *dst, const float *src, int len);
-extern void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
- const float *src1, int len);
-extern void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
-
-void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx)
-{
- c->vector_fmul = ff_vector_fmul_vfp;
- c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
-#ifdef HAVE_ARMV6
- c->float_to_int16 = ff_float_to_int16_vfp;
-#endif
-}
diff --git a/libavcodec/armv4l/mathops.h b/libavcodec/armv4l/mathops.h
deleted file mode 100644
index 37a0d3f..0000000
--- a/libavcodec/armv4l/mathops.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * simple math operations
- * Copyright (c) 2006 Michael Niedermayer <michaelni at gmx.at> et al
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARMV4L_MATHOPS_H
-#define AVCODEC_ARMV4L_MATHOPS_H
-
-#include <stdint.h>
-#include "libavutil/common.h"
-
-#ifdef FRAC_BITS
-# define MULL MULL
-static inline av_const int MULL(int a, int b)
-{
- int lo, hi;
- __asm__("smull %0, %1, %2, %3 \n\t"
- "mov %0, %0, lsr %4 \n\t"
- "add %1, %0, %1, lsl %5 \n\t"
- : "=&r"(lo), "=&r"(hi)
- : "r"(b), "r"(a), "i"(FRAC_BITS), "i"(32-FRAC_BITS));
- return hi;
-}
-#endif
-
-#define MULH MULH
-#ifdef HAVE_ARMV6
-static inline av_const int MULH(int a, int b)
-{
- int r;
- __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
- return r;
-}
-#else
-static inline av_const int MULH(int a, int b)
-{
- int lo, hi;
- __asm__ ("smull %0, %1, %2, %3" : "=&r"(lo), "=&r"(hi) : "r"(b), "r"(a));
- return hi;
-}
-#endif
-
-static inline av_const int64_t MUL64(int a, int b)
-{
- union { uint64_t x; unsigned hl[2]; } x;
- __asm__ ("smull %0, %1, %2, %3"
- : "=r"(x.hl[0]), "=r"(x.hl[1]) : "r"(a), "r"(b));
- return x.x;
-}
-#define MUL64 MUL64
-
-static inline av_const int64_t MAC64(int64_t d, int a, int b)
-{
- union { uint64_t x; unsigned hl[2]; } x = { d };
- __asm__ ("smlal %0, %1, %2, %3"
- : "+r"(x.hl[0]), "+r"(x.hl[1]) : "r"(a), "r"(b));
- return x.x;
-}
-#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
-#define MLS64(d, a, b) MAC64(d, -(a), b)
-
-#if defined(HAVE_ARMV5TE)
-
-/* signed 16x16 -> 32 multiply add accumulate */
-# define MAC16(rt, ra, rb) \
- __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb));
-
-/* signed 16x16 -> 32 multiply */
-# define MUL16 MUL16
-static inline av_const MUL16(int ra, int rb)
-{
- int rt;
- __asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb));
- return rt;
-}
-
-#endif
-
-#endif /* AVCODEC_ARMV4L_MATHOPS_H */
diff --git a/libavcodec/armv4l/mpegvideo_arm.c b/libavcodec/armv4l/mpegvideo_arm.c
deleted file mode 100644
index 1a11d7a..0000000
--- a/libavcodec/armv4l/mpegvideo_arm.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2002 Michael Niedermayer
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/avcodec.h"
-#include "libavcodec/dsputil.h"
-#include "libavcodec/mpegvideo.h"
-
-extern void MPV_common_init_iwmmxt(MpegEncContext *s);
-extern void MPV_common_init_armv5te(MpegEncContext *s);
-
-void MPV_common_init_armv4l(MpegEncContext *s)
-{
- /* IWMMXT support is a superset of armv5te, so
- * allow optimized functions for armv5te unless
- * a better iwmmxt function exists
- */
-#ifdef HAVE_ARMV5TE
- MPV_common_init_armv5te(s);
-#endif
-#ifdef HAVE_IWMMXT
- MPV_common_init_iwmmxt(s);
-#endif
-}
diff --git a/libavcodec/armv4l/mpegvideo_armv5te.c b/libavcodec/armv4l/mpegvideo_armv5te.c
deleted file mode 100644
index 0ecadb4..0000000
--- a/libavcodec/armv4l/mpegvideo_armv5te.c
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Optimization of some functions from mpegvideo.c for armv5te
- * Copyright (c) 2007 Siarhei Siamashka <ssvb at users.sourceforge.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/avcodec.h"
-#include "libavcodec/dsputil.h"
-#include "libavcodec/mpegvideo.h"
-
-
-#ifdef ENABLE_ARM_TESTS
-/**
- * h263 dequantizer supplementary function, it is performance critical and needs to
- * have optimized implementations for each architecture. Is also used as a reference
- * implementation in regression tests
- */
-static inline void dct_unquantize_h263_helper_c(DCTELEM *block, int qmul, int qadd, int count)
-{
- int i, level;
- for (i = 0; i < count; i++) {
- level = block[i];
- if (level) {
- if (level < 0) {
- level = level * qmul - qadd;
- } else {
- level = level * qmul + qadd;
- }
- block[i] = level;
- }
- }
-}
-#endif
-
-/* GCC 3.1 or higher is required to support symbolic names in assembly code */
-#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
-
-/**
- * Special optimized version of dct_unquantize_h263_helper_c, it requires the block
- * to be at least 8 bytes aligned, and may process more elements than requested.
- * But it is guaranteed to never process more than 64 elements provided that
- * xxcount argument is <= 64, so it is safe. This macro is optimized for a common
- * distribution of values for nCoeffs (they are mostly multiple of 8 plus one or
- * two extra elements). So this macro processes data as 8 elements per loop iteration
- * and contains optional 2 elements processing in the end.
- *
- * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
- */
-#define dct_unquantize_h263_special_helper_armv5te(xxblock, xxqmul, xxqadd, xxcount) \
-({ DCTELEM *xblock = xxblock; \
- int xqmul = xxqmul, xqadd = xxqadd, xcount = xxcount, xtmp; \
- int xdata1, xdata2; \
-__asm__ volatile( \
- "subs %[count], %[count], #2 \n\t" \
- "ble 2f \n\t" \
- "ldrd r4, [%[block], #0] \n\t" \
- "1: \n\t" \
- "ldrd r6, [%[block], #8] \n\t" \
-\
- "rsbs %[data1], %[zero], r4, asr #16 \n\t" \
- "addgt %[data1], %[qadd], #0 \n\t" \
- "rsblt %[data1], %[qadd], #0 \n\t" \
- "smlatbne %[data1], r4, %[qmul], %[data1] \n\t" \
-\
- "rsbs %[data2], %[zero], r5, asr #16 \n\t" \
- "addgt %[data2], %[qadd], #0 \n\t" \
- "rsblt %[data2], %[qadd], #0 \n\t" \
- "smlatbne %[data2], r5, %[qmul], %[data2] \n\t" \
-\
- "rsbs %[tmp], %[zero], r4, asl #16 \n\t" \
- "addgt %[tmp], %[qadd], #0 \n\t" \
- "rsblt %[tmp], %[qadd], #0 \n\t" \
- "smlabbne r4, r4, %[qmul], %[tmp] \n\t" \
-\
- "rsbs %[tmp], %[zero], r5, asl #16 \n\t" \
- "addgt %[tmp], %[qadd], #0 \n\t" \
- "rsblt %[tmp], %[qadd], #0 \n\t" \
- "smlabbne r5, r5, %[qmul], %[tmp] \n\t" \
-\
- "strh r4, [%[block]], #2 \n\t" \
- "strh %[data1], [%[block]], #2 \n\t" \
- "strh r5, [%[block]], #2 \n\t" \
- "strh %[data2], [%[block]], #2 \n\t" \
-\
- "rsbs %[data1], %[zero], r6, asr #16 \n\t" \
- "addgt %[data1], %[qadd], #0 \n\t" \
- "rsblt %[data1], %[qadd], #0 \n\t" \
- "smlatbne %[data1], r6, %[qmul], %[data1] \n\t" \
-\
- "rsbs %[data2], %[zero], r7, asr #16 \n\t" \
- "addgt %[data2], %[qadd], #0 \n\t" \
- "rsblt %[data2], %[qadd], #0 \n\t" \
- "smlatbne %[data2], r7, %[qmul], %[data2] \n\t" \
-\
- "rsbs %[tmp], %[zero], r6, asl #16 \n\t" \
- "addgt %[tmp], %[qadd], #0 \n\t" \
- "rsblt %[tmp], %[qadd], #0 \n\t" \
- "smlabbne r6, r6, %[qmul], %[tmp] \n\t" \
-\
- "rsbs %[tmp], %[zero], r7, asl #16 \n\t" \
- "addgt %[tmp], %[qadd], #0 \n\t" \
- "rsblt %[tmp], %[qadd], #0 \n\t" \
- "smlabbne r7, r7, %[qmul], %[tmp] \n\t" \
-\
- "strh r6, [%[block]], #2 \n\t" \
- "strh %[data1], [%[block]], #2 \n\t" \
- "strh r7, [%[block]], #2 \n\t" \
- "strh %[data2], [%[block]], #2 \n\t" \
-\
- "subs %[count], %[count], #8 \n\t" \
- "ldrgtd r4, [%[block], #0] \n\t" /* load data early to avoid load/use pipeline stall */ \
- "bgt 1b \n\t" \
-\
- "adds %[count], %[count], #2 \n\t" \
- "ble 3f \n\t" \
- "2: \n\t" \
- "ldrsh %[data1], [%[block], #0] \n\t" \
- "ldrsh %[data2], [%[block], #2] \n\t" \
- "mov %[tmp], %[qadd] \n\t" \
- "cmp %[data1], #0 \n\t" \
- "rsblt %[tmp], %[qadd], #0 \n\t" \
- "smlabbne %[data1], %[data1], %[qmul], %[tmp] \n\t" \
- "mov %[tmp], %[qadd] \n\t" \
- "cmp %[data2], #0 \n\t" \
- "rsblt %[tmp], %[qadd], #0 \n\t" \
- "smlabbne %[data2], %[data2], %[qmul], %[tmp] \n\t" \
- "strh %[data1], [%[block]], #2 \n\t" \
- "strh %[data2], [%[block]], #2 \n\t" \
- "3: \n\t" \
- : [block] "+&r" (xblock), [count] "+&r" (xcount), [tmp] "=&r" (xtmp), \
- [data1] "=&r" (xdata1), [data2] "=&r" (xdata2) \
- : [qmul] "r" (xqmul), [qadd] "r" (xqadd), [zero] "r" (0) \
- : "r4", "r5", "r6", "r7", "cc", "memory" \
-); \
-})
-
-static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
- DCTELEM *block, int n, int qscale)
-{
- int level, qmul, qadd;
- int nCoeffs;
-
- assert(s->block_last_index[n]>=0);
-
- qmul = qscale << 1;
-
- if (!s->h263_aic) {
- if (n < 4)
- level = block[0] * s->y_dc_scale;
- else
- level = block[0] * s->c_dc_scale;
- qadd = (qscale - 1) | 1;
- }else{
- qadd = 0;
- level = block[0];
- }
- if(s->ac_pred)
- nCoeffs=63;
- else
- nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
-
- dct_unquantize_h263_special_helper_armv5te(block, qmul, qadd, nCoeffs + 1);
- block[0] = level;
-}
-
-static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
- DCTELEM *block, int n, int qscale)
-{
- int qmul, qadd;
- int nCoeffs;
-
- assert(s->block_last_index[n]>=0);
-
- qadd = (qscale - 1) | 1;
- qmul = qscale << 1;
-
- nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
-
- dct_unquantize_h263_special_helper_armv5te(block, qmul, qadd, nCoeffs + 1);
-}
-
-#define HAVE_DCT_UNQUANTIZE_H263_ARMV5TE_OPTIMIZED
-
-#endif
-
-void MPV_common_init_armv5te(MpegEncContext *s)
-{
-#ifdef HAVE_DCT_UNQUANTIZE_H263_ARMV5TE_OPTIMIZED
- s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te;
- s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te;
-#endif
-}
diff --git a/libavcodec/audioconvert.c b/libavcodec/audioconvert.c
index 8e09e4c..8c6a6b8 100644
--- a/libavcodec/audioconvert.c
+++ b/libavcodec/audioconvert.c
@@ -104,7 +104,7 @@ static const struct {
const char *name;
int nb_channels;
int64_t layout;
-} const channel_layout_map[] = {
+} channel_layout_map[] = {
{ "mono", 1, CH_LAYOUT_MONO },
{ "stereo", 2, CH_LAYOUT_STEREO },
{ "surround", 3, CH_LAYOUT_SURROUND },
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 7d0cf0e..3d3e244 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -30,7 +30,7 @@
#include "libavutil/avutil.h"
#define LIBAVCODEC_VERSION_MAJOR 52
-#define LIBAVCODEC_VERSION_MINOR 3
+#define LIBAVCODEC_VERSION_MINOR 10
#define LIBAVCODEC_VERSION_MICRO 0
#define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
@@ -191,6 +191,9 @@ enum CodecID {
CODEC_ID_TGV,
CODEC_ID_TGQ,
+ /* "codecs" for HW decoding with VDPAU */
+ CODEC_ID_H264_VDPAU= 0x9000,
+
/* various PCM "codecs" */
CODEC_ID_PCM_S16LE= 0x10000,
CODEC_ID_PCM_S16BE,
@@ -400,6 +403,7 @@ enum SampleFormat {
*/
#define FF_MIN_BUFFER_SIZE 16384
+
/**
* motion estimation type.
*/
@@ -526,6 +530,10 @@ typedef struct RcOverride{
* This can be used to prevent truncation of the last audio samples.
*/
#define CODEC_CAP_SMALL_LAST_FRAME 0x0040
+/**
+ * Codec can export data for HW decoding (VDPAU).
+ */
+#define CODEC_CAP_HWACCEL_VDPAU 0x0080
//The following defines may change, don't expect compatibility if you use them.
#define MB_TYPE_INTRA4x4 0x0001
@@ -1390,6 +1398,7 @@ typedef struct AVCodecContext {
#define FF_IDCT_WMV2 19
#define FF_IDCT_FAAN 20
#define FF_IDCT_EA 21
+#define FF_IDCT_SIMPLENEON 22
/**
* slice count
@@ -2081,7 +2090,7 @@ typedef struct AVCodecContext {
/**
* number of reference frames
* - encoding: Set by user.
- * - decoding: unused
+ * - decoding: Set by lavc.
*/
int refs;
@@ -2283,6 +2292,20 @@ typedef struct AVCodecContext {
* - decoding: Set by user.
*/
int64_t request_channel_layout;
+
+ /**
+ * Ratecontrol attempt to use, at maximum, <value> of what can be used without an underflow.
+ * - encoding: Set by user.
+ * - decoding: unused.
+ */
+ float rc_max_available_vbv_use;
+
+ /**
+ * Ratecontrol attempt to use, at least, <value> times the amount needed to prevent a vbv overflow.
+ * - encoding: Set by user.
+ * - decoding: unused.
+ */
+ float rc_min_vbv_overflow_use;
} AVCodecContext;
/**
@@ -2360,23 +2383,54 @@ typedef struct AVPaletteControl {
} AVPaletteControl attribute_deprecated;
+enum AVSubtitleType {
+ SUBTITLE_NONE,
+
+ SUBTITLE_BITMAP, ///< A bitmap, pict will be set
+
+ /**
+ * Plain text, the text field must be set by the decoder and is
+ * authoritative. ass and pict fields may contain approximations.
+ */
+ SUBTITLE_TEXT,
+
+ /**
+ * Formatted text, the ass field must be set by the decoder and is
+ * authoritative. pict and text fields may contain approximations.
+ */
+ SUBTITLE_ASS,
+};
+
typedef struct AVSubtitleRect {
- uint16_t x;
- uint16_t y;
- uint16_t w;
- uint16_t h;
- uint16_t nb_colors;
- int linesize;
- uint32_t *rgba_palette;
- uint8_t *bitmap;
+ int x; ///< top left corner of pict, undefined when pict is not set
+ int y; ///< top left corner of pict, undefined when pict is not set
+ int w; ///< width of pict, undefined when pict is not set
+ int h; ///< height of pict, undefined when pict is not set
+ int nb_colors; ///< number of colors in pict, undefined when pict is not set
+
+ /**
+ * data+linesize for the bitmap of this subtitle.
+ * can be set for text/ass as well once they where rendered
+ */
+ AVPicture pict;
+ enum AVSubtitleType type;
+
+ char *text; ///< 0 terminated plain UTF-8 text
+
+ /**
+ * 0 terminated ASS/SSA compatible event line.
+ * The pressentation of this is unaffected by the other values in this
+ * struct.
+ */
+ char *ass;
} AVSubtitleRect;
typedef struct AVSubtitle {
uint16_t format; /* 0 = graphics */
uint32_t start_display_time; /* relative to packet pts, in ms */
uint32_t end_display_time; /* relative to packet pts, in ms */
- uint32_t num_rects;
- AVSubtitleRect *rects;
+ unsigned num_rects;
+ AVSubtitleRect **rects;
} AVSubtitle;
@@ -2549,7 +2603,12 @@ unsigned avcodec_version(void);
*/
void avcodec_init(void);
-void register_avcodec(AVCodec *format);
+/**
+ * Register the codec \p codec and initialize libavcodec.
+ *
+ * @see avcodec_init()
+ */
+void register_avcodec(AVCodec *codec);
/**
* Finds a registered encoder with a matching codec ID.
@@ -2738,6 +2797,9 @@ int avcodec_decode_audio2(AVCodecContext *avctx, int16_t *samples,
* the linesize is not a multiple of 16 then there's no sense in aligning the
* start of the buffer to 16.
*
+ * @note Some codecs have a delay between input and output, these need to be
+ * feeded with buf=NULL, buf_size=0 at the end to return the remaining frames.
+ *
* @param avctx the codec context
* @param[out] picture The AVFrame in which the decoded video frame will be stored.
* @param[in] buf the input buffer
@@ -2805,6 +2867,16 @@ int avcodec_encode_subtitle(AVCodecContext *avctx, uint8_t *buf, int buf_size,
int avcodec_close(AVCodecContext *avctx);
+/**
+ * Register all the codecs, parsers and bitstream filters which were enabled at
+ * configuration time. If you do not call this function you can select exactly
+ * which formats you want to support, by using the individual registration
+ * functions.
+ *
+ * @see register_avcodec
+ * @see av_register_codec_parser
+ * @see av_register_bitstream_filter
+ */
void avcodec_register_all(void);
/**
@@ -2958,7 +3030,7 @@ int av_picture_crop(AVPicture *dst, const AVPicture *src,
int av_picture_pad(AVPicture *dst, const AVPicture *src, int height, int width, int pix_fmt,
int padtop, int padbottom, int padleft, int padright, int *color);
-extern unsigned int av_xiphlacing(unsigned char *s, unsigned int v);
+unsigned int av_xiphlacing(unsigned char *s, unsigned int v);
/**
* Parses \p str and put in \p width_ptr and \p height_ptr the detected values.
@@ -2984,18 +3056,6 @@ int av_parse_video_frame_size(int *width_ptr, int *height_ptr, const char *str);
*/
int av_parse_video_frame_rate(AVRational *frame_rate, const char *str);
-/**
- * Logs a generic warning message about a missing feature.
- * @param[in] avc a pointer to an arbitrary struct of which the first field is
- * a pointer to an AVClass struct
- * @param[in] feature string containing the name of the missing feature
- * @param[in] want_sample indicates if samples are wanted which exhibit this feature.
- * If \p want_sample is non-zero, additional verbage will be added to the log
- * message which tells the user how to report samples to the development
- * mailing list.
- */
-void av_log_missing_feature(void *avc, const char *feature, int want_sample);
-
/* error handling */
#if EINVAL > 0
#define AVERROR(e) (-(e)) /**< Returns a negative error code from a POSIX error code, to return from library functions. */
diff --git a/libavcodec/avs.c b/libavcodec/avs.c
index c60fe63..3b29c85 100644
--- a/libavcodec/avs.c
+++ b/libavcodec/avs.c
@@ -25,35 +25,35 @@
typedef struct {
AVFrame picture;
-} avs_context_t;
+} AvsContext;
typedef enum {
AVS_VIDEO = 0x01,
AVS_AUDIO = 0x02,
AVS_PALETTE = 0x03,
AVS_GAME_DATA = 0x04,
-} avs_block_type_t;
+} AvsBlockType;
typedef enum {
AVS_I_FRAME = 0x00,
AVS_P_FRAME_3X3 = 0x01,
AVS_P_FRAME_2X2 = 0x02,
AVS_P_FRAME_2X3 = 0x03,
-} avs_video_sub_type_t;
+} AvsVideoSubType;
static int
avs_decode_frame(AVCodecContext * avctx,
void *data, int *data_size, const uint8_t * buf, int buf_size)
{
- avs_context_t *const avs = avctx->priv_data;
+ AvsContext *const avs = avctx->priv_data;
AVFrame *picture = data;
AVFrame *const p = (AVFrame *) & avs->picture;
const uint8_t *table, *vect;
uint8_t *out;
int i, j, x, y, stride, vect_w = 3, vect_h = 3;
- int sub_type;
- avs_block_type_t type;
+ AvsVideoSubType sub_type;
+ AvsBlockType type;
GetBitContext change_map;
if (avctx->reget_buffer(avctx, p)) {
@@ -152,7 +152,7 @@ AVCodec avs_decoder = {
"avs",
CODEC_TYPE_VIDEO,
CODEC_ID_AVS,
- sizeof(avs_context_t),
+ sizeof(AvsContext),
avs_decode_init,
NULL,
NULL,
diff --git a/libavcodec/bfin/dsputil_bfin.c b/libavcodec/bfin/dsputil_bfin.c
index ccdb19c..1a8ae3b 100644
--- a/libavcodec/bfin/dsputil_bfin.c
+++ b/libavcodec/bfin/dsputil_bfin.c
@@ -29,35 +29,35 @@
int off;
-extern void ff_bfin_idct (DCTELEM *block) attribute_l1_text;
-extern void ff_bfin_fdct (DCTELEM *block) attribute_l1_text;
-extern void ff_bfin_vp3_idct (DCTELEM *block);
-extern void ff_bfin_vp3_idct_put (uint8_t *dest, int line_size, DCTELEM *block);
-extern void ff_bfin_vp3_idct_add (uint8_t *dest, int line_size, DCTELEM *block);
-extern void ff_bfin_add_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) attribute_l1_text;
-extern void ff_bfin_put_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) attribute_l1_text;
-extern void ff_bfin_diff_pixels (DCTELEM *block, uint8_t *s1, uint8_t *s2, int stride) attribute_l1_text;
-extern void ff_bfin_get_pixels (DCTELEM *restrict block, const uint8_t *pixels, int line_size) attribute_l1_text;
-extern int ff_bfin_pix_norm1 (uint8_t * pix, int line_size) attribute_l1_text;
-extern int ff_bfin_z_sad8x8 (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h) attribute_l1_text;
-extern int ff_bfin_z_sad16x16 (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h) attribute_l1_text;
-
-extern void ff_bfin_z_put_pixels16_xy2 (uint8_t *block, const uint8_t *s0, int dest_size, int line_size, int h) attribute_l1_text;
-extern void ff_bfin_z_put_pixels8_xy2 (uint8_t *block, const uint8_t *s0, int dest_size, int line_size, int h) attribute_l1_text;
-extern void ff_bfin_put_pixels16_xy2_nornd (uint8_t *block, const uint8_t *s0, int line_size, int h) attribute_l1_text;
-extern void ff_bfin_put_pixels8_xy2_nornd (uint8_t *block, const uint8_t *s0, int line_size, int h) attribute_l1_text;
-
-
-extern int ff_bfin_pix_sum (uint8_t *p, int stride) attribute_l1_text;
-
-extern void ff_bfin_put_pixels8uc (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int dest_size, int line_size, int h) attribute_l1_text;
-extern void ff_bfin_put_pixels16uc (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int dest_size, int line_size, int h) attribute_l1_text;
-extern void ff_bfin_put_pixels8uc_nornd (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int line_size, int h) attribute_l1_text;
-extern void ff_bfin_put_pixels16uc_nornd (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int line_size, int h) attribute_l1_text;
-
-extern int ff_bfin_sse4 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) attribute_l1_text;
-extern int ff_bfin_sse8 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) attribute_l1_text;
-extern int ff_bfin_sse16 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) attribute_l1_text;
+void ff_bfin_idct (DCTELEM *block) attribute_l1_text;
+void ff_bfin_fdct (DCTELEM *block) attribute_l1_text;
+void ff_bfin_vp3_idct (DCTELEM *block);
+void ff_bfin_vp3_idct_put (uint8_t *dest, int line_size, DCTELEM *block);
+void ff_bfin_vp3_idct_add (uint8_t *dest, int line_size, DCTELEM *block);
+void ff_bfin_add_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) attribute_l1_text;
+void ff_bfin_put_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) attribute_l1_text;
+void ff_bfin_diff_pixels (DCTELEM *block, uint8_t *s1, uint8_t *s2, int stride) attribute_l1_text;
+void ff_bfin_get_pixels (DCTELEM *restrict block, const uint8_t *pixels, int line_size) attribute_l1_text;
+int ff_bfin_pix_norm1 (uint8_t * pix, int line_size) attribute_l1_text;
+int ff_bfin_z_sad8x8 (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h) attribute_l1_text;
+int ff_bfin_z_sad16x16 (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h) attribute_l1_text;
+
+void ff_bfin_z_put_pixels16_xy2 (uint8_t *block, const uint8_t *s0, int dest_size, int line_size, int h) attribute_l1_text;
+void ff_bfin_z_put_pixels8_xy2 (uint8_t *block, const uint8_t *s0, int dest_size, int line_size, int h) attribute_l1_text;
+void ff_bfin_put_pixels16_xy2_nornd (uint8_t *block, const uint8_t *s0, int line_size, int h) attribute_l1_text;
+void ff_bfin_put_pixels8_xy2_nornd (uint8_t *block, const uint8_t *s0, int line_size, int h) attribute_l1_text;
+
+
+int ff_bfin_pix_sum (uint8_t *p, int stride) attribute_l1_text;
+
+void ff_bfin_put_pixels8uc (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int dest_size, int line_size, int h) attribute_l1_text;
+void ff_bfin_put_pixels16uc (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int dest_size, int line_size, int h) attribute_l1_text;
+void ff_bfin_put_pixels8uc_nornd (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int line_size, int h) attribute_l1_text;
+void ff_bfin_put_pixels16uc_nornd (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int line_size, int h) attribute_l1_text;
+
+int ff_bfin_sse4 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) attribute_l1_text;
+int ff_bfin_sse8 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) attribute_l1_text;
+int ff_bfin_sse16 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) attribute_l1_text;
static void bfin_idct_add (uint8_t *dest, int line_size, DCTELEM *block)
diff --git a/libavcodec/bfin/mpegvideo_bfin.c b/libavcodec/bfin/mpegvideo_bfin.c
index 05743e7..d1c33a1 100644
--- a/libavcodec/bfin/mpegvideo_bfin.c
+++ b/libavcodec/bfin/mpegvideo_bfin.c
@@ -26,7 +26,7 @@
#include "dsputil_bfin.h"
-extern void ff_bfin_fdct (DCTELEM *block) attribute_l1_text;
+void ff_bfin_fdct (DCTELEM *block) attribute_l1_text;
static int dct_quantize_bfin (MpegEncContext *s,
diff --git a/libavcodec/bfin/vp3_bfin.c b/libavcodec/bfin/vp3_bfin.c
index 62990f6..1906453 100644
--- a/libavcodec/bfin/vp3_bfin.c
+++ b/libavcodec/bfin/vp3_bfin.c
@@ -22,10 +22,10 @@
#include "libavcodec/dsputil.h"
#include "dsputil_bfin.h"
-extern void ff_bfin_vp3_idct (DCTELEM *block) attribute_l1_text;
-extern void ff_bfin_idct (DCTELEM *block) attribute_l1_text;
-extern void ff_bfin_add_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) attribute_l1_text;
-extern void ff_bfin_put_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) attribute_l1_text;
+void ff_bfin_vp3_idct (DCTELEM *block) attribute_l1_text;
+void ff_bfin_idct (DCTELEM *block) attribute_l1_text;
+void ff_bfin_add_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) attribute_l1_text;
+void ff_bfin_put_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) attribute_l1_text;
/* Intra iDCT offset 128 */
void ff_bfin_vp3_idct_put (uint8_t *dest, int line_size, DCTELEM *block)
diff --git a/libavcodec/bitstream.h b/libavcodec/bitstream.h
index 5a5db5c..23bc34d 100644
--- a/libavcodec/bitstream.h
+++ b/libavcodec/bitstream.h
@@ -41,7 +41,7 @@
//#define ALT_BITSTREAM_WRITER
//#define ALIGNED_BITSTREAM_WRITER
#if !defined(LIBMPEG2_BITSTREAM_READER) && !defined(A32_BITSTREAM_READER) && !defined(ALT_BITSTREAM_READER)
-# ifdef ARCH_ARMV4L
+# ifdef ARCH_ARM
# define A32_BITSTREAM_READER
# else
# define ALT_BITSTREAM_READER
@@ -179,10 +179,6 @@ typedef struct RL_VLC_ELEM {
uint8_t run;
} RL_VLC_ELEM;
-#if defined(ARCH_SPARC) || defined(ARCH_ARMV4L) || defined(ARCH_MIPS) || defined(ARCH_BFIN)
-#define UNALIGNED_STORES_ARE_BAD
-#endif
-
#ifndef ALT_BITSTREAM_WRITER
static inline void put_bits(PutBitContext *s, int n, unsigned int value)
{
@@ -200,7 +196,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
#ifdef BITSTREAM_WRITER_LE
bit_buf |= value << (32 - bit_left);
if (n >= bit_left) {
-#ifdef UNALIGNED_STORES_ARE_BAD
+#ifndef HAVE_FAST_UNALIGNED
if (3 & (intptr_t) s->buf_ptr) {
s->buf_ptr[0] = bit_buf ;
s->buf_ptr[1] = bit_buf >> 8;
@@ -221,7 +217,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
} else {
bit_buf<<=bit_left;
bit_buf |= value >> (n - bit_left);
-#ifdef UNALIGNED_STORES_ARE_BAD
+#ifndef HAVE_FAST_UNALIGNED
if (3 & (intptr_t) s->buf_ptr) {
s->buf_ptr[0] = bit_buf >> 24;
s->buf_ptr[1] = bit_buf >> 16;
diff --git a/libavcodec/bmp.c b/libavcodec/bmp.c
index ca2e201..9a20cd6 100644
--- a/libavcodec/bmp.c
+++ b/libavcodec/bmp.c
@@ -73,25 +73,35 @@ static int bmp_decode_frame(AVCodecContext *avctx,
buf += 2; /* reserved2 */
hsize = bytestream_get_le32(&buf); /* header size */
- if(fsize <= hsize){
- av_log(avctx, AV_LOG_ERROR, "not enough data (%d < %d)\n",
- fsize, hsize);
- return -1;
- }
-
ihsize = bytestream_get_le32(&buf); /* more header size */
if(ihsize + 14 > hsize){
av_log(avctx, AV_LOG_ERROR, "invalid header size %d\n", hsize);
return -1;
}
- if (ihsize == 40) {
+ /* sometimes file size is set to some headers size, set a real size in that case */
+ if(fsize == 14 || fsize == ihsize + 14)
+ fsize = buf_size - 2;
+
+ if(fsize <= hsize){
+ av_log(avctx, AV_LOG_ERROR, "declared file size is less than header size (%d < %d)\n",
+ fsize, hsize);
+ return -1;
+ }
+
+ switch(ihsize){
+ case 40: // windib v3
+ case 64: // OS/2 v2
+ case 108: // windib v4
+ case 124: // windib v5
width = bytestream_get_le32(&buf);
height = bytestream_get_le32(&buf);
- } else if (ihsize == 12) {
+ break;
+ case 12: // OS/2 v1
width = bytestream_get_le16(&buf);
height = bytestream_get_le16(&buf);
- } else {
+ break;
+ default:
av_log(avctx, AV_LOG_ERROR, "unsupported BMP file, patch welcome\n");
return -1;
}
diff --git a/libavcodec/cavs.h b/libavcodec/cavs.h
index c684acf..2a1a90e 100644
--- a/libavcodec/cavs.h
+++ b/libavcodec/cavs.h
@@ -144,13 +144,13 @@ DECLARE_ALIGNED_8(typedef, struct) {
int16_t ref;
} vector_t;
-typedef struct dec_2dvlc_t {
+struct dec_2dvlc {
int8_t rltab[59][3];
int8_t level_add[27];
int8_t golomb_order;
int inc_limit;
int8_t max_run;
-} dec_2dvlc_t;
+};
typedef struct {
MpegEncContext s;
@@ -226,9 +226,9 @@ typedef struct {
extern const uint8_t ff_cavs_dequant_shift[64];
extern const uint16_t ff_cavs_dequant_mul[64];
-extern const dec_2dvlc_t ff_cavs_intra_dec[7];
-extern const dec_2dvlc_t ff_cavs_inter_dec[7];
-extern const dec_2dvlc_t ff_cavs_chroma_dec[5];
+extern const struct dec_2dvlc ff_cavs_intra_dec[7];
+extern const struct dec_2dvlc ff_cavs_inter_dec[7];
+extern const struct dec_2dvlc ff_cavs_chroma_dec[5];
extern const uint8_t ff_cavs_chroma_qp[64];
extern const uint8_t ff_cavs_scan3x3[4];
extern const uint8_t ff_cavs_partition_flags[30];
diff --git a/libavcodec/cavsdata.h b/libavcodec/cavsdata.h
index 94b620b..6d80ac1 100644
--- a/libavcodec/cavsdata.h
+++ b/libavcodec/cavsdata.h
@@ -101,7 +101,7 @@ const vector_t ff_cavs_intra_mv = {0,0,1,REF_INTRA};
#define EOB 0,0,0
-const dec_2dvlc_t ff_cavs_intra_dec[7] = {
+const struct dec_2dvlc ff_cavs_intra_dec[7] = {
{
{ //level / run / table_inc
{ 1, 1, 1},{ -1, 1, 1},{ 1, 2, 1},{ -1, 2, 1},{ 1, 3, 1},{ -1, 3, 1},
@@ -238,7 +238,7 @@ const dec_2dvlc_t ff_cavs_intra_dec[7] = {
}
};
-const dec_2dvlc_t ff_cavs_inter_dec[7] = {
+const struct dec_2dvlc ff_cavs_inter_dec[7] = {
{
{ //level / run
{ 1, 1, 1},{ -1, 1, 1},{ 1, 2, 1},{ -1, 2, 1},{ 1, 3, 1},{ -1, 3, 1},
@@ -375,7 +375,7 @@ const dec_2dvlc_t ff_cavs_inter_dec[7] = {
}
};
-const dec_2dvlc_t ff_cavs_chroma_dec[5] = {
+const struct dec_2dvlc ff_cavs_chroma_dec[5] = {
{
{ //level / run
{ 1, 1, 1},{ -1, 1, 1},{ 1, 2, 1},{ -1, 2, 1},{ 1, 3, 1},{ -1, 3, 1},
diff --git a/libavcodec/cavsdec.c b/libavcodec/cavsdec.c
index a978fe1..18dcb57 100644
--- a/libavcodec/cavsdec.c
+++ b/libavcodec/cavsdec.c
@@ -113,7 +113,7 @@ static inline int get_ue_code(GetBitContext *gb, int order) {
* @param stride line stride in frame buffer
*/
static int decode_residual_block(AVSContext *h, GetBitContext *gb,
- const dec_2dvlc_t *r, int esc_golomb_order,
+ const struct dec_2dvlc *r, int esc_golomb_order,
int qp, uint8_t *dst, int stride) {
int i, level_code, esc_code, level, run, mask;
DCTELEM level_buf[65];
diff --git a/libavcodec/celp_math.h b/libavcodec/celp_math.h
index ce0726f..7cf7861 100644
--- a/libavcodec/celp_math.h
+++ b/libavcodec/celp_math.h
@@ -51,26 +51,6 @@ int ff_exp2(uint16_t power);
int ff_log2(uint32_t value);
/**
- * returns the dot product.
- * @param a input data array
- * @param b input data array
- * @param length number of elements
- * @param shift right shift by this value will be done after multiplication
- *
- * @return dot product = sum of elementwise products
- */
-static int dot_product(const int16_t* a, const int16_t* b, int length, int shift)
-{
- int sum = 0;
- int i;
-
- for(i=0; i<length; i++)
- sum += (a[i] * b[i]) >> shift;
-
- return sum;
-}
-
-/**
* Shift value left or right depending on sign of offset parameter.
* @param value value to shift
* @param offset shift offset
@@ -91,6 +71,6 @@ static inline int bidir_sal(int value, int offset)
*
* @return dot product = sum of elementwise products
*/
-extern float ff_dot_productf(const float* a, const float* b, int length);
+float ff_dot_productf(const float* a, const float* b, int length);
#endif /* AVCODEC_CELP_MATH_H */
diff --git a/libavcodec/cinepak.c b/libavcodec/cinepak.c
index 77e5ff1..8c2efba 100644
--- a/libavcodec/cinepak.c
+++ b/libavcodec/cinepak.c
@@ -40,7 +40,7 @@
typedef struct {
uint8_t y0, y1, y2, y3;
uint8_t u, v;
-} cvid_codebook_t;
+} cvid_codebook;
#define MAX_STRIPS 32
@@ -48,9 +48,9 @@ typedef struct {
uint16_t id;
uint16_t x1, y1;
uint16_t x2, y2;
- cvid_codebook_t v4_codebook[256];
- cvid_codebook_t v1_codebook[256];
-} cvid_strip_t;
+ cvid_codebook v4_codebook[256];
+ cvid_codebook v1_codebook[256];
+} cvid_strip;
typedef struct CinepakContext {
@@ -63,13 +63,13 @@ typedef struct CinepakContext {
int width, height;
int palette_video;
- cvid_strip_t strips[MAX_STRIPS];
+ cvid_strip strips[MAX_STRIPS];
int sega_film_skip_bytes;
} CinepakContext;
-static void cinepak_decode_codebook (cvid_codebook_t *codebook,
+static void cinepak_decode_codebook (cvid_codebook *codebook,
int chunk_id, int size, const uint8_t *data)
{
const uint8_t *eod = (data + size);
@@ -118,12 +118,12 @@ static void cinepak_decode_codebook (cvid_codebook_t *codebook,
}
}
-static int cinepak_decode_vectors (CinepakContext *s, cvid_strip_t *strip,
+static int cinepak_decode_vectors (CinepakContext *s, cvid_strip *strip,
int chunk_id, int size, const uint8_t *data)
{
const uint8_t *eod = (data + size);
uint32_t flag, mask;
- cvid_codebook_t *codebook;
+ cvid_codebook *codebook;
unsigned int x, y;
uint32_t iy[4];
uint32_t iu[2];
@@ -262,7 +262,7 @@ static int cinepak_decode_vectors (CinepakContext *s, cvid_strip_t *strip,
}
static int cinepak_decode_strip (CinepakContext *s,
- cvid_strip_t *strip, const uint8_t *data, int size)
+ cvid_strip *strip, const uint8_t *data, int size)
{
const uint8_t *eod = (data + size);
int chunk_id, chunk_size;
diff --git a/libavcodec/cook.c b/libavcodec/cook.c
index ba9f30f..1affe03 100644
--- a/libavcodec/cook.c
+++ b/libavcodec/cook.c
@@ -1177,6 +1177,7 @@ static int cook_decode_init(AVCodecContext *avctx)
}
avctx->sample_fmt = SAMPLE_FMT_S16;
+ avctx->channel_layout = (avctx->channels==2) ? CH_LAYOUT_STEREO : CH_LAYOUT_MONO;
#ifdef COOKDEBUG
dump_cook_context(q);
diff --git a/libavcodec/dca.c b/libavcodec/dca.c
index b4f5897..dbd1152 100644
--- a/libavcodec/dca.c
+++ b/libavcodec/dca.c
@@ -88,6 +88,48 @@ static const int64_t dca_core_channel_layout[] = {
CH_FRONT_LEFT_OF_CENTER|CH_FRONT_CENTER|CH_FRONT_RIGHT_OF_CENTER|CH_LAYOUT_STEREO|CH_SIDE_LEFT|CH_BACK_CENTER|CH_SIDE_RIGHT, ///< 8, CL + C+ CR + L + R + SL + S+ SR
};
+static const int8_t dca_lfe_index[] = {
+ 1,2,2,2,2,3,2,3,2,3,2,3,1,3,2,3
+};
+
+static const int8_t dca_channel_reorder_lfe[][8] = {
+ { 0, -1, -1, -1, -1, -1, -1, -1},
+ { 0, 1, -1, -1, -1, -1, -1, -1},
+ { 0, 1, -1, -1, -1, -1, -1, -1},
+ { 0, 1, -1, -1, -1, -1, -1, -1},
+ { 0, 1, -1, -1, -1, -1, -1, -1},
+ { 2, 0, 1, -1, -1, -1, -1, -1},
+ { 0, 1, 3, -1, -1, -1, -1, -1},
+ { 2, 0, 1, 4, -1, -1, -1, -1},
+ { 0, 1, 3, 4, -1, -1, -1, -1},
+ { 2, 0, 1, 4, 5, -1, -1, -1},
+ { 3, 4, 0, 1, 5, 6, -1, -1},
+ { 2, 0, 1, 4, 5, 6, -1, -1},
+ { 0, 6, 4, 5, 2, 3, -1, -1},
+ { 4, 2, 5, 0, 1, 6, 7, -1},
+ { 5, 6, 0, 1, 7, 3, 8, 4},
+ { 4, 2, 5, 0, 1, 6, 8, 7},
+};
+
+static const int8_t dca_channel_reorder_nolfe[][8] = {
+ { 0, -1, -1, -1, -1, -1, -1, -1},
+ { 0, 1, -1, -1, -1, -1, -1, -1},
+ { 0, 1, -1, -1, -1, -1, -1, -1},
+ { 0, 1, -1, -1, -1, -1, -1, -1},
+ { 0, 1, -1, -1, -1, -1, -1, -1},
+ { 2, 0, 1, -1, -1, -1, -1, -1},
+ { 0, 1, 2, -1, -1, -1, -1, -1},
+ { 2, 0, 1, 3, -1, -1, -1, -1},
+ { 0, 1, 2, 3, -1, -1, -1, -1},
+ { 2, 0, 1, 3, 4, -1, -1, -1},
+ { 2, 3, 0, 1, 4, 5, -1, -1},
+ { 2, 0, 1, 3, 4, 5, -1, -1},
+ { 0, 5, 3, 4, 1, 2, -1, -1},
+ { 3, 2, 4, 0, 1, 5, 6, -1},
+ { 4, 5, 0, 1, 6, 2, 7, 3},
+ { 3, 2, 4, 0, 1, 5, 7, 6},
+};
+
#define DCA_DOLBY 101 /* FIXME */
@@ -198,6 +240,7 @@ typedef struct {
uint8_t dca_buffer[DCA_MAX_FRAME_SIZE];
int dca_buffer_size; ///< how much data is in the dca_buffer
+ const int8_t* channel_order_tab; ///< channel reordering table, lfe and non lfe
GetBitContext gb;
/* Current position in DCA frame */
int current_subframe;
@@ -1013,7 +1056,7 @@ static int dca_subsubframe(DCAContext * s)
for (k = 0; k < s->prim_channels; k++) {
/* static float pcm_to_double[8] =
{32768.0, 32768.0, 524288.0, 524288.0, 0, 8388608.0, 8388608.0};*/
- qmf_32_subbands(s, k, subband_samples[k], &s->samples[256 * k],
+ qmf_32_subbands(s, k, subband_samples[k], &s->samples[256 * s->channel_order_tab[k]],
M_SQRT1_2*s->scale_bias /*pcm_to_double[s->source_pcm_res] */ ,
s->add_bias );
}
@@ -1027,12 +1070,11 @@ static int dca_subsubframe(DCAContext * s)
/* Generate LFE samples for this subsubframe FIXME!!! */
if (s->output & DCA_LFE) {
int lfe_samples = 2 * s->lfe * s->subsubframes;
- int i_channels = dca_channels[s->output & DCA_CHANNEL_MASK];
lfe_interpolation_fir(s->lfe, 2 * s->lfe,
s->lfe_data + lfe_samples +
2 * s->lfe * subsubframe,
- &s->samples[256 * i_channels],
+ &s->samples[256 * dca_lfe_index[s->amode]],
(1.0/256.0)*s->scale_bias, s->add_bias);
/* Outputs 20bits pcm samples */
}
@@ -1133,8 +1175,9 @@ static int dca_convert_bitstream(const uint8_t * src, int src_size, uint8_t * ds
PutBitContext pb;
if((unsigned)src_size > (unsigned)max_size) {
- av_log(NULL, AV_LOG_ERROR, "Input frame size larger then DCA_MAX_FRAME_SIZE!\n");
- return -1;
+// av_log(NULL, AV_LOG_ERROR, "Input frame size larger then DCA_MAX_FRAME_SIZE!\n");
+// return -1;
+ src_size = max_size;
}
mrk = AV_RB32(src);
@@ -1192,15 +1235,26 @@ static int dca_decode_frame(AVCodecContext * avctx,
avctx->bit_rate = s->bit_rate;
channels = s->prim_channels + !!s->lfe;
- if(avctx->request_channels == 2 && s->prim_channels > 2) {
- channels = 2;
- s->output = DCA_STEREO;
- avctx->channel_layout = CH_LAYOUT_STEREO;
- }
- if (s->amode<16)
+
+ if (s->amode<16) {
avctx->channel_layout = dca_core_channel_layout[s->amode];
- if (s->lfe) avctx->channel_layout |= CH_LOW_FREQUENCY;
+ if (s->lfe) {
+ avctx->channel_layout |= CH_LOW_FREQUENCY;
+ s->channel_order_tab = dca_channel_reorder_lfe[s->amode];
+ } else
+ s->channel_order_tab = dca_channel_reorder_nolfe[s->amode];
+
+ if(avctx->request_channels == 2 && s->prim_channels > 2) {
+ channels = 2;
+ s->output = DCA_STEREO;
+ avctx->channel_layout = CH_LAYOUT_STEREO;
+ }
+ } else {
+ av_log(avctx, AV_LOG_ERROR, "Non standard configuration %d !\n",s->amode);
+ return -1;
+ }
+
/* There is nothing that prevents a dts frame to change channel configuration
but FFmpeg doesn't support that so only set the channels if it is previously
diff --git a/libavcodec/dca.h b/libavcodec/dca.h
index 3895719..02c0a51 100644
--- a/libavcodec/dca.h
+++ b/libavcodec/dca.h
@@ -31,4 +31,7 @@
#define DCA_MARKER_14B_BE 0x1FFFE800
#define DCA_MARKER_14B_LE 0xFF1F00E8
+/** DCA-HD specific block starts with this marker. */
+#define DCA_HD_MARKER 0x64582025
+
#endif /* AVCODEC_DCA_H */
diff --git a/libavcodec/dca_parser.c b/libavcodec/dca_parser.c
index f182506..b1c06e4 100644
--- a/libavcodec/dca_parser.c
+++ b/libavcodec/dca_parser.c
@@ -34,6 +34,7 @@ typedef struct DCAParseContext {
uint32_t lastmarker;
int size;
int framesize;
+ int hd_pos;
} DCAParseContext;
#define IS_MARKER(state, i, buf, buf_size) \
@@ -75,10 +76,16 @@ static int dca_find_frame_end(DCAParseContext * pc1, const uint8_t * buf,
for (; i < buf_size; i++) {
pc1->size++;
state = (state << 8) | buf[i];
- if (state == pc1->lastmarker && IS_MARKER(state, i, buf, buf_size) && (!pc1->framesize || pc1->framesize == pc1->size)) {
+ if (state == DCA_HD_MARKER && !pc1->hd_pos)
+ pc1->hd_pos = pc1->size;
+ if (state == pc1->lastmarker && IS_MARKER(state, i, buf, buf_size)) {
+ if(pc1->framesize > pc1->size)
+ continue;
+ if(!pc1->framesize){
+ pc1->framesize = pc1->hd_pos ? pc1->hd_pos : pc1->size;
+ }
pc->frame_start_found = 0;
pc->state = -1;
- pc1->framesize = pc1->size;
pc1->size = 0;
return i - 3;
}
diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c
index 7a80505..04f75b0 100644
--- a/libavcodec/dct-test.c
+++ b/libavcodec/dct-test.c
@@ -35,9 +35,10 @@
#include "libavutil/common.h"
#include "simple_idct.h"
+#include "aandcttab.h"
#include "faandct.h"
#include "faanidct.h"
-#include "i386/idct_xvid.h"
+#include "x86/idct_xvid.h"
#undef printf
#undef random
@@ -45,30 +46,36 @@
void *fast_memcpy(void *a, const void *b, size_t c){return memcpy(a,b,c);};
/* reference fdct/idct */
-extern void fdct(DCTELEM *block);
-extern void idct(DCTELEM *block);
-extern void init_fdct();
+void fdct(DCTELEM *block);
+void idct(DCTELEM *block);
+void init_fdct();
-extern void ff_mmx_idct(DCTELEM *data);
-extern void ff_mmxext_idct(DCTELEM *data);
+void ff_mmx_idct(DCTELEM *data);
+void ff_mmxext_idct(DCTELEM *data);
-extern void odivx_idct_c (short *block);
+void odivx_idct_c(short *block);
// BFIN
-extern void ff_bfin_idct (DCTELEM *block) ;
-extern void ff_bfin_fdct (DCTELEM *block) ;
+void ff_bfin_idct(DCTELEM *block);
+void ff_bfin_fdct(DCTELEM *block);
// ALTIVEC
-extern void fdct_altivec (DCTELEM *block);
-//extern void idct_altivec (DCTELEM *block);?? no routine
+void fdct_altivec(DCTELEM *block);
+//void idct_altivec(DCTELEM *block);?? no routine
+// ARM
+void j_rev_dct_ARM(DCTELEM *data);
+void simple_idct_ARM(DCTELEM *data);
+void simple_idct_armv5te(DCTELEM *data);
+void ff_simple_idct_armv6(DCTELEM *data);
+void ff_simple_idct_neon(DCTELEM *data);
struct algo {
const char *name;
enum { FDCT, IDCT } is_idct;
void (* func) (DCTELEM *block);
void (* ref) (DCTELEM *block);
- enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM } format;
+ enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format;
int mm_support;
};
@@ -116,21 +123,24 @@ struct algo algos[] = {
{"BFINidct", 1, ff_bfin_idct, idct, NO_PERM},
#endif
+#ifdef ARCH_ARM
+ {"SIMPLE-ARM", 1, simple_idct_ARM, idct, NO_PERM },
+ {"INT-ARM", 1, j_rev_dct_ARM, idct, MMX_PERM },
+#ifdef HAVE_ARMV5TE
+ {"SIMPLE-ARMV5TE", 1, simple_idct_armv5te, idct, NO_PERM },
+#endif
+#ifdef HAVE_ARMV6
+ {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, idct, MMX_PERM },
+#endif
+#ifdef HAVE_NEON
+ {"SIMPLE-NEON", 1, ff_simple_idct_neon, idct, PARTTRANS_PERM },
+#endif
+#endif /* ARCH_ARM */
+
{ 0 }
};
#define AANSCALE_BITS 12
-static const unsigned short aanscales[64] = {
- /* precomputed values scaled up by 14 bits */
- 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
- 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270,
- 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906,
- 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
- 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
- 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552,
- 8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446,
- 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247
-};
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
@@ -245,6 +255,9 @@ void dct_error(const char *name, int is_idct,
} else if (form == SSE2_PERM) {
for(i=0; i<64; i++)
block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
+ } else if (form == PARTTRANS_PERM) {
+ for(i=0; i<64; i++)
+ block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
} else {
for(i=0; i<64; i++)
block[i]= block1[i];
@@ -263,7 +276,7 @@ void dct_error(const char *name, int is_idct,
if (form == SCALE_PERM) {
for(i=0; i<64; i++) {
- scale = 8*(1 << (AANSCALE_BITS + 11)) / aanscales[i];
+ scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS;
}
}
diff --git a/libavcodec/dirac_parser.c b/libavcodec/dirac_parser.c
index 199354a..ac82dca 100644
--- a/libavcodec/dirac_parser.c
+++ b/libavcodec/dirac_parser.c
@@ -1,7 +1,8 @@
/*
* Dirac parser
*
- * Copyright (c) 2007 Marco Gerards <marco at gnu.org>
+ * Copyright (c) 2007-2008 Marco Gerards <marco at gnu.org>
+ * Copyright (c) 2008 BBC, Anuradha Suraparaju <asuraparaju at gmail.com>
*
* This file is part of FFmpeg.
*
@@ -34,42 +35,200 @@
* Finds the end of the current frame in the bitstream.
* @return the position of the first byte of the next frame or -1
*/
-static int find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size)
+typedef struct DiracParseContext {
+ int state;
+ int is_synced;
+ int sync_offset;
+ int header_bytes_needed;
+ int overread_index;
+ int buffer_size;
+ int index;
+ uint8_t *buffer;
+ int dirac_unit_size;
+ uint8_t *dirac_unit;
+} DiracParseContext;
+
+static int find_frame_end(DiracParseContext *pc,
+ const uint8_t *buf, int buf_size)
{
uint32_t state = pc->state;
- int i;
-
- for (i = 0; i < buf_size; i++) {
- state = (state << 8) | buf[i];
- if (state == DIRAC_PARSE_INFO_PREFIX) {
- pc->frame_start_found ^= 1;
- if (!pc->frame_start_found) {
- pc->state = -1;
- return i - 3;
+ int i = 0;
+
+ if (!pc->is_synced) {
+ for (i = 0; i < buf_size; i++) {
+ state = (state << 8) | buf[i];
+ if (state == DIRAC_PARSE_INFO_PREFIX) {
+ state = -1;
+ pc->is_synced = 1;
+ pc->header_bytes_needed = 9;
+ pc->sync_offset = i;
+ break;
}
}
}
+ if (pc->is_synced) {
+ pc->sync_offset = 0;
+ for (; i < buf_size; i++) {
+ if (state == DIRAC_PARSE_INFO_PREFIX) {
+ if ((buf_size-i) >= pc->header_bytes_needed) {
+ pc->state = -1;
+ return i + pc->header_bytes_needed;
+ } else {
+ pc->header_bytes_needed = 9-(buf_size-i);
+ break;
+ }
+ } else
+ state = (state << 8) | buf[i];
+ }
+ }
pc->state = state;
+ return -1;
+}
+
+typedef struct DiracParseUnit
+{
+ int next_pu_offset;
+ int prev_pu_offset;
+ uint8_t pu_type;
+} DiracParseUnit;
+
+static int unpack_parse_unit(DiracParseUnit *pu, DiracParseContext *pc,
+ int offset)
+{
+ uint8_t *start = pc->buffer + offset;
+ uint8_t *end = pc->buffer + pc->index;
+ if (start < pc->buffer || (start+13 > end))
+ return 0;
+ pu->pu_type = start[4];
+
+ pu->next_pu_offset = AV_RB32(start+5);
+ pu->prev_pu_offset = AV_RB32(start+9);
+
+ if (pu->pu_type == 0x10 && pu->next_pu_offset == 0)
+ pu->next_pu_offset = 13;
+
+ return 1;
+}
+
+static int dirac_combine_frame(AVCodecParserContext *s, AVCodecContext *avctx,
+ int next, const uint8_t **buf, int *buf_size)
+{
+ int parse_timing_info = (s->pts == AV_NOPTS_VALUE &&
+ s->dts == AV_NOPTS_VALUE);
+ DiracParseContext *pc = s->priv_data;
+
+ if (pc->overread_index) {
+ memcpy(pc->buffer, pc->buffer + pc->overread_index,
+ pc->index - pc->overread_index);
+ pc->index -= pc->overread_index;
+ pc->overread_index = 0;
+ if (*buf_size == 0 && pc->buffer[4] == 0x10) {
+ *buf = pc->buffer;
+ *buf_size = pc->index;
+ return 0;
+ }
+ }
+
+ if ( next == -1) {
+ /* Found a possible frame start but not a frame end */
+ void *new_buffer = av_fast_realloc(pc->buffer, &pc->buffer_size,
+ pc->index + (*buf_size -
+ pc->sync_offset));
+ pc->buffer = new_buffer;
+ memcpy(pc->buffer+pc->index, (*buf + pc->sync_offset),
+ *buf_size - pc->sync_offset);
+ pc->index += *buf_size - pc->sync_offset;
+ return -1;
+ } else {
+ /* Found a possible frame start and a possible frame end */
+ DiracParseUnit pu1, pu;
+ void *new_buffer = av_fast_realloc(pc->buffer, &pc->buffer_size,
+ pc->index + next);
+ pc->buffer = new_buffer;
+ memcpy(pc->buffer + pc->index, *buf, next);
+ pc->index += next;
- return END_NOT_FOUND;
+ /* Need to check if we have a valid Parse Unit. We can't go by the
+ * sync pattern 'BBCD' alone because arithmetic coding of the residual
+ * and motion data can cause the pattern triggering a false start of
+ * frame. So check if the previous parse offset of the next parse unit
+ * is equal to the next parse offset of the current parse unit then
+ * we can be pretty sure that we have a valid parse unit */
+ if (!unpack_parse_unit(&pu1, pc, pc->index - 13) ||
+ !unpack_parse_unit(&pu, pc, pc->index - 13 - pu1.prev_pu_offset) ||
+ pu.next_pu_offset != pu1.prev_pu_offset) {
+ pc->index -= 9;
+ *buf_size = next-9;
+ pc->header_bytes_needed = 9;
+ return -1;
+ }
+
+ /* All non-frame data must be accompanied by frame data. This is to
+ * ensure that pts is set correctly. So if the current parse unit is
+ * not frame data, wait for frame data to come along */
+
+ pc->dirac_unit = pc->buffer + pc->index - 13 -
+ pu1.prev_pu_offset - pc->dirac_unit_size;
+
+ pc->dirac_unit_size += pu.next_pu_offset;
+
+ if ((pu.pu_type&0x08) != 0x08) {
+ pc->header_bytes_needed = 9;
+ *buf_size = next;
+ return -1;
+ }
+
+ /* Get the picture number to set the pts and dts*/
+ if (parse_timing_info) {
+ uint8_t *cur_pu = pc->buffer +
+ pc->index - 13 - pu1.prev_pu_offset;
+ int pts = AV_RB32(cur_pu + 13);
+ if (s->last_pts == 0 && s->last_dts == 0)
+ s->dts = pts - 1;
+ else
+ s->dts = s->last_dts+1;
+ s->pts = pts;
+ if (!avctx->has_b_frames && (cur_pu[4] & 0x03))
+ avctx->has_b_frames = 1;
+ }
+ if (avctx->has_b_frames && s->pts == s->dts)
+ s->pict_type = FF_B_TYPE;
+
+ /* Finally have a complete Dirac data unit */
+ *buf = pc->dirac_unit;
+ *buf_size = pc->dirac_unit_size;
+
+ pc->dirac_unit_size = 0;
+ pc->overread_index = pc->index-13;
+ pc->header_bytes_needed = 9;
+ }
+ return next;
}
static int dirac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
const uint8_t **poutbuf, int *poutbuf_size,
const uint8_t *buf, int buf_size)
{
- ParseContext *pc = s->priv_data;
+ DiracParseContext *pc = s->priv_data;
int next;
+ *poutbuf = NULL;
+ *poutbuf_size = 0;
+
if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
next = buf_size;
- }else{
+ *poutbuf = buf;
+ *poutbuf_size = buf_size;
+ /* Assume that data has been packetized into an encapsulation unit. */
+ } else {
next = find_frame_end(pc, buf, buf_size);
+ if (!pc->is_synced && next == -1) {
+ /* No frame start found yet. So throw away the entire buffer. */
+ return buf_size;
+ }
- if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
- *poutbuf = NULL;
- *poutbuf_size = 0;
+ if (dirac_combine_frame(s, avctx, next, &buf, &buf_size) < 0) {
return buf_size;
}
}
@@ -79,10 +238,18 @@ static int dirac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
return next;
}
+static void dirac_parse_close(AVCodecParserContext *s)
+{
+ DiracParseContext *pc = s->priv_data;
+
+ if (pc->buffer_size > 0)
+ av_free(pc->buffer);
+}
+
AVCodecParser dirac_parser = {
{ CODEC_ID_DIRAC },
- sizeof(ParseContext),
+ sizeof(DiracParseContext),
NULL,
dirac_parse,
- ff_parse_close,
+ dirac_parse_close,
};
diff --git a/libavcodec/dnxhddec.c b/libavcodec/dnxhddec.c
index 1f4f1c8..4bf98de 100644
--- a/libavcodec/dnxhddec.c
+++ b/libavcodec/dnxhddec.c
@@ -219,14 +219,12 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, int x, int y)
int dct_offset;
int qscale, i;
- ctx->dsp.clear_blocks(ctx->blocks[0]);
- ctx->dsp.clear_blocks(ctx->blocks[2]); // FIXME change clear blocks to take block amount
-
qscale = get_bits(&ctx->gb, 11);
skip_bits1(&ctx->gb);
//av_log(ctx->avctx, AV_LOG_DEBUG, "qscale %d\n", qscale);
for (i = 0; i < 8; i++) {
+ ctx->dsp.clear_block(ctx->blocks[i]);
dnxhd_decode_dct_block(ctx, ctx->blocks[i], i, qscale);
}
diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c
index 534b850..a79a161 100644
--- a/libavcodec/dnxhdenc.c
+++ b/libavcodec/dnxhdenc.c
@@ -27,67 +27,29 @@
#include "avcodec.h"
#include "dsputil.h"
#include "mpegvideo.h"
-#include "dnxhddata.h"
-
-typedef struct {
- uint16_t mb;
- int value;
-} RCCMPEntry;
-
-typedef struct {
- int ssd;
- int bits;
-} RCEntry;
+#include "dnxhdenc.h"
int dct_quantize_c(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow);
-typedef struct DNXHDEncContext {
- MpegEncContext m; ///< Used for quantization dsp functions
-
- AVFrame frame;
- int cid;
- const CIDEntry *cid_table;
- uint8_t *msip; ///< Macroblock Scan Indexes Payload
- uint32_t *slice_size;
-
- struct DNXHDEncContext *thread[MAX_THREADS];
-
- unsigned dct_y_offset;
- unsigned dct_uv_offset;
- int interlaced;
- int cur_field;
-
- DECLARE_ALIGNED_16(DCTELEM, blocks[8][64]);
-
- int (*qmatrix_c) [64];
- int (*qmatrix_l) [64];
- uint16_t (*qmatrix_l16)[2][64];
- uint16_t (*qmatrix_c16)[2][64];
-
- unsigned frame_bits;
- uint8_t *src[3];
-
- uint32_t *vlc_codes;
- uint8_t *vlc_bits;
- uint16_t *run_codes;
- uint8_t *run_bits;
-
- /** Rate control */
- unsigned slice_bits;
- unsigned qscale;
- unsigned lambda;
-
- unsigned thread_size;
-
- uint16_t *mb_bits;
- uint8_t *mb_qscale;
-
- RCCMPEntry *mb_cmp;
- RCEntry (*mb_rc)[8160];
-} DNXHDEncContext;
-
#define LAMBDA_FRAC_BITS 10
+static av_always_inline void dnxhd_get_pixels_8x4(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
+{
+ int i;
+ for (i = 0; i < 4; i++) {
+ block[0] = pixels[0]; block[1] = pixels[1];
+ block[2] = pixels[2]; block[3] = pixels[3];
+ block[4] = pixels[4]; block[5] = pixels[5];
+ block[6] = pixels[6]; block[7] = pixels[7];
+ pixels += line_size;
+ block += 8;
+ }
+ memcpy(block , block- 8, sizeof(*block)*8);
+ memcpy(block+ 8, block-16, sizeof(*block)*8);
+ memcpy(block+16, block-24, sizeof(*block)*8);
+ memcpy(block+24, block-32, sizeof(*block)*8);
+}
+
static int dnxhd_init_vlc(DNXHDEncContext *ctx)
{
int i, j, level, run;
@@ -211,8 +173,13 @@ static int dnxhd_encode_init(AVCodecContext *avctx)
ctx->m.mb_intra = 1;
ctx->m.h263_aic = 1;
+ ctx->get_pixels_8x4_sym = dnxhd_get_pixels_8x4;
+
dsputil_init(&ctx->m.dsp, avctx);
ff_dct_common_init(&ctx->m);
+#ifdef HAVE_MMX
+ ff_dnxhd_init_mmx(ctx);
+#endif
if (!ctx->m.dct_quantize)
ctx->m.dct_quantize = dct_quantize_c;
@@ -385,27 +352,6 @@ static av_always_inline int dnxhd_calc_ac_bits(DNXHDEncContext *ctx, DCTELEM *bl
return bits;
}
-static av_always_inline void dnxhd_get_pixels_4x8(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
-{
- int i;
- for (i = 0; i < 4; i++) {
- block[0] = pixels[0];
- block[1] = pixels[1];
- block[2] = pixels[2];
- block[3] = pixels[3];
- block[4] = pixels[4];
- block[5] = pixels[5];
- block[6] = pixels[6];
- block[7] = pixels[7];
- pixels += line_size;
- block += 8;
- }
- memcpy(block , block- 8, sizeof(*block)*8);
- memcpy(block+ 8, block-16, sizeof(*block)*8);
- memcpy(block+16, block-24, sizeof(*block)*8);
- memcpy(block+24, block-32, sizeof(*block)*8);
-}
-
static av_always_inline void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, int mb_y)
{
const uint8_t *ptr_y = ctx->thread[0]->src[0] + ((mb_y << 4) * ctx->m.linesize) + (mb_x << 4);
@@ -420,12 +366,14 @@ static av_always_inline void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, in
if (mb_y+1 == ctx->m.mb_height && ctx->m.avctx->height == 1080) {
if (ctx->interlaced) {
- dnxhd_get_pixels_4x8(ctx->blocks[4], ptr_y + ctx->dct_y_offset , ctx->m.linesize);
- dnxhd_get_pixels_4x8(ctx->blocks[5], ptr_y + ctx->dct_y_offset + 8, ctx->m.linesize);
- dnxhd_get_pixels_4x8(ctx->blocks[6], ptr_u + ctx->dct_uv_offset , ctx->m.uvlinesize);
- dnxhd_get_pixels_4x8(ctx->blocks[7], ptr_v + ctx->dct_uv_offset , ctx->m.uvlinesize);
- } else
- memset(ctx->blocks[4], 0, 4*64*sizeof(DCTELEM));
+ ctx->get_pixels_8x4_sym(ctx->blocks[4], ptr_y + ctx->dct_y_offset , ctx->m.linesize);
+ ctx->get_pixels_8x4_sym(ctx->blocks[5], ptr_y + ctx->dct_y_offset + 8, ctx->m.linesize);
+ ctx->get_pixels_8x4_sym(ctx->blocks[6], ptr_u + ctx->dct_uv_offset , ctx->m.uvlinesize);
+ ctx->get_pixels_8x4_sym(ctx->blocks[7], ptr_v + ctx->dct_uv_offset , ctx->m.uvlinesize);
+ } else {
+ dsp->clear_block(ctx->blocks[4]); dsp->clear_block(ctx->blocks[5]);
+ dsp->clear_block(ctx->blocks[6]); dsp->clear_block(ctx->blocks[7]);
+ }
} else {
dsp->get_pixels(ctx->blocks[4], ptr_y + ctx->dct_y_offset , ctx->m.linesize);
dsp->get_pixels(ctx->blocks[5], ptr_y + ctx->dct_y_offset + 8, ctx->m.linesize);
diff --git a/libavcodec/dnxhdenc.h b/libavcodec/dnxhdenc.h
new file mode 100644
index 0000000..6f9f647
--- /dev/null
+++ b/libavcodec/dnxhdenc.h
@@ -0,0 +1,90 @@
+/*
+ * VC3/DNxHD encoder structure definitions and prototypes
+ * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
+ *
+ * VC-3 encoder funded by the British Broadcasting Corporation
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DNXHDENC_H
+#define AVCODEC_DNXHDENC_H
+
+#include <stdint.h>
+#include "libavcodec/mpegvideo.h"
+#include "libavcodec/dnxhddata.h"
+
+typedef struct {
+ uint16_t mb;
+ int value;
+} RCCMPEntry;
+
+typedef struct {
+ int ssd;
+ int bits;
+} RCEntry;
+
+typedef struct DNXHDEncContext {
+ MpegEncContext m; ///< Used for quantization dsp functions
+
+ AVFrame frame;
+ int cid;
+ const CIDEntry *cid_table;
+ uint8_t *msip; ///< Macroblock Scan Indexes Payload
+ uint32_t *slice_size;
+
+ struct DNXHDEncContext *thread[MAX_THREADS];
+
+ unsigned dct_y_offset;
+ unsigned dct_uv_offset;
+ int interlaced;
+ int cur_field;
+
+ DECLARE_ALIGNED_16(DCTELEM, blocks[8][64]);
+
+ int (*qmatrix_c) [64];
+ int (*qmatrix_l) [64];
+ uint16_t (*qmatrix_l16)[2][64];
+ uint16_t (*qmatrix_c16)[2][64];
+
+ unsigned frame_bits;
+ uint8_t *src[3];
+
+ uint32_t *vlc_codes;
+ uint8_t *vlc_bits;
+ uint16_t *run_codes;
+ uint8_t *run_bits;
+
+ /** Rate control */
+ unsigned slice_bits;
+ unsigned qscale;
+ unsigned lambda;
+
+ unsigned thread_size;
+
+ uint16_t *mb_bits;
+ uint8_t *mb_qscale;
+
+ RCCMPEntry *mb_cmp;
+ RCEntry (*mb_rc)[8160];
+
+ void (*get_pixels_8x4_sym)(DCTELEM */*align 16*/, const uint8_t *, int);
+} DNXHDEncContext;
+
+void ff_dnxhd_init_mmx(DNXHDEncContext *ctx);
+
+#endif /* AVCODEC_DNXHDENC_H */
diff --git a/libavcodec/dpcm.c b/libavcodec/dpcm.c
index ff684ae..74ca9ec 100644
--- a/libavcodec/dpcm.c
+++ b/libavcodec/dpcm.c
@@ -268,7 +268,7 @@ static int dpcm_decode_frame(AVCodecContext *avctx,
n1 = (buf[in] >> 4) & 0xF;
n2 = buf[in++] & 0xF;
s->sample[0] += s->sol_table[n1];
- if (s->sample[0] < 0) s->sample[0] = 0;
+ if (s->sample[0] < 0) s->sample[0] = 0;
if (s->sample[0] > 255) s->sample[0] = 255;
output_samples[out++] = (s->sample[0] - 128) << 8;
s->sample[s->channels - 1] += s->sol_table[n2];
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 9a73e74..76f5dbb 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -169,7 +169,7 @@ void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_s
int j;
j = src_scantable[i];
st->permutated[i] = permutation[j];
-#ifdef ARCH_POWERPC
+#ifdef ARCH_PPC
st->inverse[j] = i;
#endif
}
@@ -2743,6 +2743,27 @@ void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
+#if defined(CONFIG_RV30_DECODER)
+void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
+#endif /* CONFIG_RV30_DECODER */
+
+#if defined(CONFIG_RV40_DECODER)
+static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
+ put_pixels16_xy2_c(dst, src, stride, 16);
+}
+static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
+ avg_pixels16_xy2_c(dst, src, stride, 16);
+}
+static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
+ put_pixels8_xy2_c(dst, src, stride, 8);
+}
+static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
+ avg_pixels8_xy2_c(dst, src, stride, 8);
+}
+
+void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
+#endif /* CONFIG_RV40_DECODER */
+
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
int i;
@@ -2970,6 +2991,63 @@ static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int b
h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
+static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
+{
+ int d;
+ for( d = 0; d < 16; d++ ) {
+ const int p2 = pix[-3*xstride];
+ const int p1 = pix[-2*xstride];
+ const int p0 = pix[-1*xstride];
+
+ const int q0 = pix[ 0*xstride];
+ const int q1 = pix[ 1*xstride];
+ const int q2 = pix[ 2*xstride];
+
+ if( FFABS( p0 - q0 ) < alpha &&
+ FFABS( p1 - p0 ) < beta &&
+ FFABS( q1 - q0 ) < beta ) {
+
+ if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
+ if( FFABS( p2 - p0 ) < beta)
+ {
+ const int p3 = pix[-4*xstride];
+ /* p0', p1', p2' */
+ pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
+ pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
+ pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
+ } else {
+ /* p0' */
+ pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+ }
+ if( FFABS( q2 - q0 ) < beta)
+ {
+ const int q3 = pix[3*xstride];
+ /* q0', q1', q2' */
+ pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
+ pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
+ pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
+ } else {
+ /* q0' */
+ pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+ }
+ }else{
+ /* p0', q0' */
+ pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
+ pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
+ }
+ }
+ pix += ystride;
+ }
+}
+static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
+{
+ h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
+}
+static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
+{
+ h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
+}
+
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
int i, d;
@@ -3403,6 +3481,11 @@ void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
}
}
+static void clear_block_c(DCTELEM *block)
+{
+ memset(block, 0, sizeof(DCTELEM)*64);
+}
+
/**
* memset(blocks, 0, sizeof(DCTELEM)*6*64)
*/
@@ -4259,6 +4342,10 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->h264_idct8_add= ff_h264_idct8_add_c;
c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
+ c->h264_idct_add16 = ff_h264_idct_add16_c;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_c;
+ c->h264_idct_add8 = ff_h264_idct_add8_c;
+ c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
}
c->get_pixels = get_pixels_c;
@@ -4271,6 +4358,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->sum_abs_dctelem = sum_abs_dctelem_c;
c->gmc1 = gmc1_c;
c->gmc = ff_gmc_c;
+ c->clear_block = clear_block_c;
c->clear_blocks = clear_blocks_c;
c->pix_sum = pix_sum_c;
c->pix_norm1 = pix_norm1_c;
@@ -4411,6 +4499,16 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
#if defined(CONFIG_H264_ENCODER)
ff_h264dspenc_init(c,avctx);
#endif
+#if defined(CONFIG_RV30_DECODER)
+ ff_rv30dsp_init(c,avctx);
+#endif
+#if defined(CONFIG_RV40_DECODER)
+ ff_rv40dsp_init(c,avctx);
+ c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
+ c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
+ c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
+ c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
+#endif
c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
@@ -4466,6 +4564,8 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
+ c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
+ c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
@@ -4524,11 +4624,11 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
- if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx);
+ if (ENABLE_ARM) dsputil_init_arm (c, avctx);
if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
if (ENABLE_VIS) dsputil_init_vis (c, avctx);
if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
- if (ENABLE_POWERPC) dsputil_init_ppc (c, avctx);
+ if (ENABLE_PPC) dsputil_init_ppc (c, avctx);
if (ENABLE_MMI) dsputil_init_mmi (c, avctx);
if (ENABLE_SH4) dsputil_init_sh4 (c, avctx);
if (ENABLE_BFIN) dsputil_init_bfin (c, avctx);
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 9a3acde..88ed315 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -60,6 +60,10 @@ void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
+void ff_h264_idct_add16_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1,
const float *src2, int src3, int blocksize, int step);
@@ -169,7 +173,7 @@ typedef struct ScanTable{
const uint8_t *scantable;
uint8_t permutated[64];
uint8_t raster_end[64];
-#ifdef ARCH_POWERPC
+#ifdef ARCH_PPC
/** Used by dct_quantize_altivec to find last-non-zero */
DECLARE_ALIGNED(16, uint8_t, inverse[64]);
#endif
@@ -203,6 +207,7 @@ typedef struct DSPContext {
*/
void (*gmc )(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
+ void (*clear_block)(DCTELEM *block/*align 16*/);
void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
int (*pix_sum)(uint8_t * pix, int line_size);
int (*pix_norm1)(uint8_t * pix, int line_size);
@@ -346,6 +351,8 @@ typedef struct DSPContext {
void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0);
/* v/h_loop_filter_luma_intra: align 16 */
+ void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
+ void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0);
void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0);
void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
@@ -435,11 +442,19 @@ typedef struct DSPContext {
#define EDGE_WIDTH 16
/* h264 functions */
+ /* NOTE!!! if you implement any of h264_idct8_add, h264_idct8_add4 then you must implement all of them
+ NOTE!!! if you implement any of h264_idct_add, h264_idct_add16, h264_idct_add16intra, h264_idct_add8 then you must implement all of them
+ The reason for above, is that no 2 out of one list may use a different permutation.
+ */
void (*h264_idct_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
void (*h264_dct)(DCTELEM block[4][4]);
+ void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+ void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+ void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+ void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
/* snow wavelet */
void (*vertical_compose97i)(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
@@ -484,6 +499,16 @@ typedef struct DSPContext {
* @param shift number of bits to discard from product
*/
int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift);
+
+ /* rv30 functions */
+ qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
+ qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];
+
+ /* rv40 functions */
+ qpel_mc_func put_rv40_qpel_pixels_tab[4][16];
+ qpel_mc_func avg_rv40_qpel_pixels_tab[4][16];
+ h264_chroma_mc_func put_rv40_chroma_pixels_tab[3];
+ h264_chroma_mc_func avg_rv40_chroma_pixels_tab[3];
} DSPContext;
void dsputil_static_init(void);
@@ -547,7 +572,7 @@ static inline int get_penalty_factor(int lambda, int lambda2, int type){
int mm_support(void);
void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
-void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx);
@@ -582,7 +607,7 @@ static inline void emms(void)
void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
-#elif defined(ARCH_ARMV4L)
+#elif defined(ARCH_ARM)
extern int mm_flags;
@@ -591,7 +616,7 @@ extern int mm_flags;
# define STRIDE_ALIGN 16
#endif
-#elif defined(ARCH_POWERPC)
+#elif defined(ARCH_PPC)
extern int mm_flags;
diff --git a/libavcodec/dv.c b/libavcodec/dv.c
index a3f0511..e3f54b8 100644
--- a/libavcodec/dv.c
+++ b/libavcodec/dv.c
@@ -54,8 +54,6 @@ typedef struct DVVideoContext {
uint8_t *buf;
uint8_t dv_zigzag[2][64];
- uint32_t dv_idct_factor[2][2][22][64];
- uint32_t dv100_idct_factor[4][4][16][64];
void (*get_pixels)(DCTELEM *block, const uint8_t *pixels, int line_size);
void (*fdct[2])(DCTELEM *block);
@@ -90,65 +88,182 @@ static inline int dv_work_pool_size(const DVprofile *d)
return size;
}
-static int dv_init_dynamic_tables(const DVprofile *d)
+static inline void dv_calc_mb_coordinates(const DVprofile *d, int chan, int seq, int slot,
+ uint16_t *tbl)
{
- int j,i,c,s,p,k;
-
- if (d->work_chunks[dv_work_pool_size(d)-1].buf_offset)
- return 0;
-
- p = i = 0;
- for (c=0; c<d->n_difchan; c++) {
- for (s=0; s<d->difseg_size; s++) {
- p += 6;
- for (j=0; j<27; j++) {
- p += !(j%3);
- if (!(DV_PROFILE_IS_1080i50(d) && c != 0 && s == 11) &&
- !(DV_PROFILE_IS_720p50(d) && s > 9)) {
- for (k=0; k<5; k++)
- d->work_chunks[i].mb_coordinates[k] = d->video_place[(c*d->difseg_size+s)*27*5 + j*5 + k];
- d->work_chunks[i++].buf_offset = p;
- }
- p += 5;
- }
+ const static uint8_t off[] = { 2, 6, 8, 0, 4 };
+ const static uint8_t shuf1[] = { 36, 18, 54, 0, 72 };
+ const static uint8_t shuf2[] = { 24, 12, 36, 0, 48 };
+ const static uint8_t shuf3[] = { 18, 9, 27, 0, 36 };
+
+ const static uint8_t l_start[] = {0, 4, 9, 13, 18, 22, 27, 31, 36, 40};
+ const static uint8_t l_start_shuffled[] = { 9, 4, 13, 0, 18 };
+
+ const static uint8_t serpent1[] = {0, 1, 2, 2, 1, 0,
+ 0, 1, 2, 2, 1, 0,
+ 0, 1, 2, 2, 1, 0,
+ 0, 1, 2, 2, 1, 0,
+ 0, 1, 2};
+ const static uint8_t serpent2[] = {0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0,
+ 0, 1, 2, 3, 4, 5};
+
+ const static uint8_t remap[][2] = {{ 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, /* dummy */
+ { 0, 0}, { 0, 1}, { 0, 2}, { 0, 3}, {10, 0},
+ {10, 1}, {10, 2}, {10, 3}, {20, 0}, {20, 1},
+ {20, 2}, {20, 3}, {30, 0}, {30, 1}, {30, 2},
+ {30, 3}, {40, 0}, {40, 1}, {40, 2}, {40, 3},
+ {50, 0}, {50, 1}, {50, 2}, {50, 3}, {60, 0},
+ {60, 1}, {60, 2}, {60, 3}, {70, 0}, {70, 1},
+ {70, 2}, {70, 3}, { 0,64}, { 0,65}, { 0,66},
+ {10,64}, {10,65}, {10,66}, {20,64}, {20,65},
+ {20,66}, {30,64}, {30,65}, {30,66}, {40,64},
+ {40,65}, {40,66}, {50,64}, {50,65}, {50,66},
+ {60,64}, {60,65}, {60,66}, {70,64}, {70,65},
+ {70,66}, { 0,67}, {20,67}, {40,67}, {60,67}};
+
+ int i, k, m;
+ int x, y, blk;
+
+ for (m=0; m<5; m++) {
+ switch (d->width) {
+ case 1440:
+ blk = (chan*11+seq)*27+slot;
+
+ if (chan == 0 && seq == 11) {
+ x = m*27+slot;
+ if (x<90) {
+ y = 0;
+ } else {
+ x = (x - 90)*2;
+ y = 67;
+ }
+ } else {
+ i = (4*chan + blk + off[m])%11;
+ k = (blk/11)%27;
+
+ x = shuf1[m] + (chan&1)*9 + k%9;
+ y = (i*3+k/9)*2 + (chan>>1) + 1;
+ }
+ tbl[m] = (x<<1)|(y<<9);
+ break;
+ case 1280:
+ blk = (chan*10+seq)*27+slot;
+
+ i = (4*chan + (seq/5) + 2*blk + off[m])%10;
+ k = (blk/5)%27;
+
+ x = shuf1[m]+(chan&1)*9 + k%9;
+ y = (i*3+k/9)*2 + (chan>>1) + 4;
+
+ if (x >= 80) {
+ x = remap[y][0]+((x-80)<<(y>59));
+ y = remap[y][1];
+ }
+ tbl[m] = (x<<1)|(y<<9);
+ break;
+ case 960:
+ blk = (chan*10+seq)*27+slot;
+
+ i = (4*chan + (seq/5) + 2*blk + off[m])%10;
+ k = (blk/5)%27 + (i&1)*3;
+
+ x = shuf2[m] + k%6 + 6*(chan&1);
+ y = l_start[i] + k/6 + 45*(chan>>1);
+ tbl[m] = (x<<1)|(y<<9);
+ break;
+ case 720:
+ switch (d->pix_fmt) {
+ case PIX_FMT_YUV422P:
+ x = shuf3[m] + slot/3;
+ y = serpent1[slot] +
+ ((((seq + off[m]) % d->difseg_size)<<1) + chan)*3;
+ tbl[m] = (x<<1)|(y<<8);
+ break;
+ case PIX_FMT_YUV420P:
+ x = shuf3[m] + slot/3;
+ y = serpent1[slot] +
+ ((seq + off[m]) % d->difseg_size)*3;
+ tbl[m] = (x<<1)|(y<<9);
+ break;
+ case PIX_FMT_YUV411P:
+ i = (seq + off[m]) % d->difseg_size;
+ k = slot + ((m==1||m==2)?3:0);
+
+ x = l_start_shuffled[m] + k/6;
+ y = serpent2[k] + i*6;
+ if (x>21)
+ y = y*2 - i*6;
+ tbl[m] = (x<<2)|(y<<8);
+ break;
+ }
+ default:
+ break;
}
}
- return 0;
}
-static void dv_build_unquantize_tables(DVVideoContext *s, uint8_t* perm)
+static int dv_init_dynamic_tables(const DVprofile *d)
{
- int i, q, a;
-
- /* NOTE: max left shift is 6 */
- for (q = 0; q < 22; q++) {
- /* 88DCT */
- i = 1;
- for (a = 0; a < 4; a++) {
- for (; i < dv_quant_areas[a]; i++) {
- /* 88 table */
- s->dv_idct_factor[0][0][q][i] = dv_iweight_88[i] << (dv_quant_shifts[q][a] + 1);
- s->dv_idct_factor[1][0][q][i] = s->dv_idct_factor[0][0][q][i] << 1;
-
- /* 248 table */
- s->dv_idct_factor[0][1][q][i] = dv_iweight_248[i] << (dv_quant_shifts[q][a] + 1);
- s->dv_idct_factor[1][1][q][i] = s->dv_idct_factor[0][1][q][i] << 1;
+ int j,i,c,s,p;
+ uint32_t *factor1, *factor2;
+ const int *iweight1, *iweight2;
+
+ if (!d->work_chunks[dv_work_pool_size(d)-1].buf_offset) {
+ p = i = 0;
+ for (c=0; c<d->n_difchan; c++) {
+ for (s=0; s<d->difseg_size; s++) {
+ p += 6;
+ for (j=0; j<27; j++) {
+ p += !(j%3);
+ if (!(DV_PROFILE_IS_1080i50(d) && c != 0 && s == 11) &&
+ !(DV_PROFILE_IS_720p50(d) && s > 9)) {
+ dv_calc_mb_coordinates(d, c, s, j, &d->work_chunks[i].mb_coordinates[0]);
+ d->work_chunks[i++].buf_offset = p;
+ }
+ p += 5;
+ }
}
}
}
- for (a = 0; a < 4; a++) {
- for (q = 0; q < 16; q++) {
- for (i = 1; i < 64; i++) {
- s->dv100_idct_factor[0][a][q][i] = (dv100_qstep[q] << (a + 9)) * dv_iweight_1080_y[i];
- s->dv100_idct_factor[1][a][q][i] = (dv100_qstep[q] << (a + 9)) * dv_iweight_1080_c[i];
- s->dv100_idct_factor[2][a][q][i] = (dv100_qstep[q] << (a + 9)) * dv_iweight_720_y[i];
- s->dv100_idct_factor[3][a][q][i] = (dv100_qstep[q] << (a + 9)) * dv_iweight_720_c[i];
+ if (!d->idct_factor[DV_PROFILE_IS_HD(d)?8191:5631]) {
+ factor1 = &d->idct_factor[0];
+ factor2 = &d->idct_factor[DV_PROFILE_IS_HD(d)?4096:2816];
+ if (d->height == 720) {
+ iweight1 = &dv_iweight_720_y[0];
+ iweight2 = &dv_iweight_720_c[0];
+ } else {
+ iweight1 = &dv_iweight_1080_y[0];
+ iweight2 = &dv_iweight_1080_c[0];
+ }
+ if (DV_PROFILE_IS_HD(d)) {
+ for (c = 0; c < 4; c++) {
+ for (s = 0; s < 16; s++) {
+ for (i = 0; i < 64; i++) {
+ *factor1++ = (dv100_qstep[s] << (c + 9)) * iweight1[i];
+ *factor2++ = (dv100_qstep[s] << (c + 9)) * iweight2[i];
+ }
+ }
+ }
+ } else {
+ iweight1 = &dv_iweight_88[0];
+ for (j = 0; j < 2; j++, iweight1 = &dv_iweight_248[0]) {
+ for (s = 0; s < 22; s++) {
+ for (i = c = 0; c < 4; c++) {
+ for (; i < dv_quant_areas[c]; i++) {
+ *factor1 = iweight1[i] << (dv_quant_shifts[s][c] + 1);
+ *factor2++ = (*factor1++) << 1;
+ }
+ }
}
}
}
}
+ return 0;
+}
+
static av_cold int dvvideo_init(AVCodecContext *avctx)
{
DVVideoContext *s = avctx->priv_data;
@@ -272,9 +387,6 @@ static av_cold int dvvideo_init(AVCodecContext *avctx)
}else
memcpy(s->dv_zigzag[1], ff_zigzag248_direct, 64);
- /* XXX: do it only for constant case */
- dv_build_unquantize_tables(s, dsp.idct_permutation);
-
avctx->coded_frame = &s->picture;
s->avctx = avctx;
@@ -395,8 +507,9 @@ static inline void dv_calculate_mb_xy(DVVideoContext *s, DVwork_chunk *work_chun
}
/* mb_x and mb_y are in units of 8 pixels */
-static inline void dv_decode_video_segment(DVVideoContext *s, DVwork_chunk *work_chunk)
+static int dv_decode_video_segment(AVCodecContext *avctx, DVwork_chunk *work_chunk)
{
+ DVVideoContext *s = avctx->priv_data;
int quant, dc, dct_mode, class1, j;
int mb_index, mb_x, mb_y, last_index;
int y_stride, linesize;
@@ -442,13 +555,13 @@ static inline void dv_decode_video_segment(DVVideoContext *s, DVwork_chunk *work
if (DV_PROFILE_IS_HD(s->sys)) {
mb->idct_put = s->idct_put[0];
mb->scan_table = s->dv_zigzag[0];
- mb->factor_table = s->dv100_idct_factor[((s->sys->height == 720) << 1) | (j >= 4)][class1][quant];
+ mb->factor_table = &s->sys->idct_factor[(j >= 4)*4*16*64 + class1*16*64 + quant*64];
is_field_mode[mb_index] |= !j && dct_mode;
} else {
mb->idct_put = s->idct_put[dct_mode && log2_blocksize == 3];
mb->scan_table = s->dv_zigzag[dct_mode];
- mb->factor_table = s->dv_idct_factor[class1 == 3][dct_mode]
- [quant + dv_quant_offset[class1]];
+ mb->factor_table = &s->sys->idct_factor[(class1 == 3)*2*22*64 + dct_mode*22*64 +
+ (quant + dv_quant_offset[class1])*64];
}
dc = dc << 2;
/* convert to unsigned because 128 is not added in the
@@ -576,6 +689,7 @@ static inline void dv_decode_video_segment(DVVideoContext *s, DVwork_chunk *work
}
}
}
+ return 0;
}
#if ENABLE_SMALL
@@ -857,8 +971,9 @@ static inline void dv_guess_qnos(EncBlockInfo* blks, int* qnos)
}
}
-static inline void dv_encode_video_segment(DVVideoContext *s, DVwork_chunk *work_chunk)
+static int dv_encode_video_segment(AVCodecContext *avctx, DVwork_chunk *work_chunk)
{
+ DVVideoContext *s = avctx->priv_data;
int mb_index, i, j;
int mb_x, mb_y, c_offset, linesize;
uint8_t* y_ptr;
@@ -1004,22 +1119,10 @@ static inline void dv_encode_video_segment(DVVideoContext *s, DVwork_chunk *work
for (j = 0; j < 5 * 6; j++)
flush_put_bits(&pbs[j]);
-}
-static int dv_decode_mt(AVCodecContext *avctx, void* sl)
-{
- dv_decode_video_segment((DVVideoContext *)avctx->priv_data, (DVwork_chunk*)sl);
return 0;
}
-#ifdef CONFIG_DVVIDEO_ENCODER
-static int dv_encode_mt(AVCodecContext *avctx, void* sl)
-{
- dv_encode_video_segment((DVVideoContext *)avctx->priv_data, (DVwork_chunk*)sl);
- return 0;
-}
-#endif
-
#ifdef CONFIG_DVVIDEO_DECODER
/* NOTE: exactly one frame must be given (120000 bytes for NTSC,
144000 bytes for PAL - or twice those for 50Mbps) */
@@ -1050,7 +1153,7 @@ static int dvvideo_decode_frame(AVCodecContext *avctx,
s->picture.top_field_first = 0;
s->buf = buf;
- avctx->execute(avctx, dv_decode_mt, s->sys->work_chunks, NULL,
+ avctx->execute(avctx, dv_decode_video_segment, s->sys->work_chunks, NULL,
dv_work_pool_size(s->sys), sizeof(DVwork_chunk));
emms_c();
@@ -1203,7 +1306,7 @@ static int dvvideo_encode_frame(AVCodecContext *c, uint8_t *buf, int buf_size,
s->picture.pict_type = FF_I_TYPE;
s->buf = buf;
- c->execute(c, dv_encode_mt, s->sys->work_chunks, NULL,
+ c->execute(c, dv_encode_video_segment, s->sys->work_chunks, NULL,
dv_work_pool_size(s->sys), sizeof(DVwork_chunk));
emms_c();
diff --git a/libavcodec/dvbsub.c b/libavcodec/dvbsub.c
index d7cb2c4..ed548e2 100644
--- a/libavcodec/dvbsub.c
+++ b/libavcodec/dvbsub.c
@@ -228,8 +228,8 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
for (region_id = 0; region_id < h->num_rects; region_id++) {
*q++ = region_id;
*q++ = 0xff; /* reserved */
- bytestream_put_be16(&q, h->rects[region_id].x); /* left pos */
- bytestream_put_be16(&q, h->rects[region_id].y); /* top pos */
+ bytestream_put_be16(&q, h->rects[region_id]->x); /* left pos */
+ bytestream_put_be16(&q, h->rects[region_id]->y); /* top pos */
}
bytestream_put_be16(&pseg_len, q - pseg_len - 2);
@@ -239,10 +239,10 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
/* CLUT segment */
- if (h->rects[clut_id].nb_colors <= 4) {
+ if (h->rects[clut_id]->nb_colors <= 4) {
/* 2 bpp, some decoders do not support it correctly */
bpp_index = 0;
- } else if (h->rects[clut_id].nb_colors <= 16) {
+ } else if (h->rects[clut_id]->nb_colors <= 16) {
/* 4 bpp, standard encoding */
bpp_index = 1;
} else {
@@ -257,15 +257,16 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
*q++ = clut_id;
*q++ = (0 << 4) | 0xf; /* version = 0 */
- for(i = 0; i < h->rects[clut_id].nb_colors; i++) {
+ for(i = 0; i < h->rects[clut_id]->nb_colors; i++) {
*q++ = i; /* clut_entry_id */
*q++ = (1 << (7 - bpp_index)) | (0xf << 1) | 1; /* 2 bits/pixel full range */
{
int a, r, g, b;
- a = (h->rects[clut_id].rgba_palette[i] >> 24) & 0xff;
- r = (h->rects[clut_id].rgba_palette[i] >> 16) & 0xff;
- g = (h->rects[clut_id].rgba_palette[i] >> 8) & 0xff;
- b = (h->rects[clut_id].rgba_palette[i] >> 0) & 0xff;
+ uint32_t x= ((uint32_t*)h->rects[clut_id]->pict.data[1])[i];
+ a = (x >> 24) & 0xff;
+ r = (x >> 16) & 0xff;
+ g = (x >> 8) & 0xff;
+ b = (x >> 0) & 0xff;
*q++ = RGB_TO_Y_CCIR(r, g, b);
*q++ = RGB_TO_V_CCIR(r, g, b, 0);
@@ -282,10 +283,10 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
/* region composition segment */
- if (h->rects[region_id].nb_colors <= 4) {
+ if (h->rects[region_id]->nb_colors <= 4) {
/* 2 bpp, some decoders do not support it correctly */
bpp_index = 0;
- } else if (h->rects[region_id].nb_colors <= 16) {
+ } else if (h->rects[region_id]->nb_colors <= 16) {
/* 4 bpp, standard encoding */
bpp_index = 1;
} else {
@@ -299,8 +300,8 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
q += 2; /* segment length */
*q++ = region_id;
*q++ = (s->object_version << 4) | (0 << 3) | 0x07; /* version , no fill */
- bytestream_put_be16(&q, h->rects[region_id].w); /* region width */
- bytestream_put_be16(&q, h->rects[region_id].h); /* region height */
+ bytestream_put_be16(&q, h->rects[region_id]->w); /* region width */
+ bytestream_put_be16(&q, h->rects[region_id]->h); /* region height */
*q++ = ((1 + bpp_index) << 5) | ((1 + bpp_index) << 2) | 0x03;
*q++ = region_id; /* clut_id == region_id */
*q++ = 0; /* 8 bit fill colors */
@@ -322,10 +323,10 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
for (object_id = 0; object_id < h->num_rects; object_id++) {
/* Object Data segment */
- if (h->rects[object_id].nb_colors <= 4) {
+ if (h->rects[object_id]->nb_colors <= 4) {
/* 2 bpp, some decoders do not support it correctly */
bpp_index = 0;
- } else if (h->rects[object_id].nb_colors <= 16) {
+ } else if (h->rects[object_id]->nb_colors <= 16) {
/* 4 bpp, standard encoding */
bpp_index = 1;
} else {
@@ -358,12 +359,12 @@ static int encode_dvb_subtitles(DVBSubtitleContext *s,
dvb_encode_rle = dvb_encode_rle4;
top_ptr = q;
- dvb_encode_rle(&q, h->rects[object_id].bitmap, h->rects[object_id].w * 2,
- h->rects[object_id].w, h->rects[object_id].h >> 1);
+ dvb_encode_rle(&q, h->rects[object_id]->pict.data[0], h->rects[object_id]->w * 2,
+ h->rects[object_id]->w, h->rects[object_id]->h >> 1);
bottom_ptr = q;
- dvb_encode_rle(&q, h->rects[object_id].bitmap + h->rects[object_id].w,
- h->rects[object_id].w * 2, h->rects[object_id].w,
- h->rects[object_id].h >> 1);
+ dvb_encode_rle(&q, h->rects[object_id]->pict.data[0] + h->rects[object_id]->w,
+ h->rects[object_id]->w * 2, h->rects[object_id]->w,
+ h->rects[object_id]->h >> 1);
bytestream_put_be16(&ptop_field_len, bottom_ptr - top_ptr);
bytestream_put_be16(&pbottom_field_len, q - bottom_ptr);
diff --git a/libavcodec/dvbsubdec.c b/libavcodec/dvbsubdec.c
index 3f47c1b..689c068 100644
--- a/libavcodec/dvbsubdec.c
+++ b/libavcodec/dvbsubdec.c
@@ -1285,14 +1285,17 @@ static int dvbsub_display_end_segment(AVCodecContext *avctx, const uint8_t *buf,
sub->num_rects = ctx->display_list_size;
- if (sub->num_rects > 0)
- sub->rects = av_mallocz(sizeof(AVSubtitleRect) * sub->num_rects);
+ if (sub->num_rects > 0){
+ sub->rects = av_mallocz(sizeof(*sub->rects) * sub->num_rects);
+ for(i=0; i<sub->num_rects; i++)
+ sub->rects[i] = av_mallocz(sizeof(*sub->rects[i]));
+ }
i = 0;
for (display = ctx->display_list; display; display = display->next) {
region = get_region(ctx, display->region_id);
- rect = &sub->rects[i];
+ rect = sub->rects[i];
if (!region)
continue;
@@ -1302,7 +1305,7 @@ static int dvbsub_display_end_segment(AVCodecContext *avctx, const uint8_t *buf,
rect->w = region->width;
rect->h = region->height;
rect->nb_colors = 16;
- rect->linesize = region->width;
+ rect->pict.linesize[0] = region->width;
clut = get_clut(ctx, region->clut);
@@ -1322,11 +1325,11 @@ static int dvbsub_display_end_segment(AVCodecContext *avctx, const uint8_t *buf,
break;
}
- rect->rgba_palette = av_malloc((1 << region->depth) * sizeof(uint32_t));
- memcpy(rect->rgba_palette, clut_table, (1 << region->depth) * sizeof(uint32_t));
+ rect->pict.data[1] = av_malloc((1 << region->depth) * sizeof(uint32_t));
+ memcpy(rect->pict.data[1], clut_table, (1 << region->depth) * sizeof(uint32_t));
- rect->bitmap = av_malloc(region->buf_size);
- memcpy(rect->bitmap, region->pbuf, region->buf_size);
+ rect->pict.data[0] = av_malloc(region->buf_size);
+ memcpy(rect->pict.data[0], region->pbuf, region->buf_size);
i++;
}
diff --git a/libavcodec/dvdata.h b/libavcodec/dvdata.h
index 60feda4..f53fbea 100644
--- a/libavcodec/dvdata.h
+++ b/libavcodec/dvdata.h
@@ -53,7 +53,7 @@ typedef struct DVprofile {
int width; /* picture width in pixels */
AVRational sar[2]; /* sample aspect ratios for 4:3 and 16:9 */
DVwork_chunk *work_chunks; /* each thread gets its own chunk of frame to work on */
- const uint16_t *video_place; /* positions of all DV macroblocks */
+ uint32_t *idct_factor; /* set of iDCT factor tables */
enum PixelFormat pix_fmt; /* picture pixel format */
int bpm; /* blocks per macroblock */
const uint8_t *block_sizes; /* AC block sizes, in bits */
@@ -328,5711 +328,6 @@ static const uint8_t dv100_qstep[16] = {
2, 3, 4, 5, 6, 7, 8, 16, 18, 20, 22, 24, 28, 52
};
-/* NOTE: I prefer hardcoding the positioning of DV blocks, it is
- simpler :-) */
-
-static const uint16_t dv_place_420[1620] = {
- 0x0c24, 0x2412, 0x3036, 0x0000, 0x1848,
- 0x0e24, 0x2612, 0x3236, 0x0200, 0x1a48,
- 0x1024, 0x2812, 0x3436, 0x0400, 0x1c48,
- 0x1026, 0x2814, 0x3438, 0x0402, 0x1c4a,
- 0x0e26, 0x2614, 0x3238, 0x0202, 0x1a4a,
- 0x0c26, 0x2414, 0x3038, 0x0002, 0x184a,
- 0x0c28, 0x2416, 0x303a, 0x0004, 0x184c,
- 0x0e28, 0x2616, 0x323a, 0x0204, 0x1a4c,
- 0x1028, 0x2816, 0x343a, 0x0404, 0x1c4c,
- 0x102a, 0x2818, 0x343c, 0x0406, 0x1c4e,
- 0x0e2a, 0x2618, 0x323c, 0x0206, 0x1a4e,
- 0x0c2a, 0x2418, 0x303c, 0x0006, 0x184e,
- 0x0c2c, 0x241a, 0x303e, 0x0008, 0x1850,
- 0x0e2c, 0x261a, 0x323e, 0x0208, 0x1a50,
- 0x102c, 0x281a, 0x343e, 0x0408, 0x1c50,
- 0x102e, 0x281c, 0x3440, 0x040a, 0x1c52,
- 0x0e2e, 0x261c, 0x3240, 0x020a, 0x1a52,
- 0x0c2e, 0x241c, 0x3040, 0x000a, 0x1852,
- 0x0c30, 0x241e, 0x3042, 0x000c, 0x1854,
- 0x0e30, 0x261e, 0x3242, 0x020c, 0x1a54,
- 0x1030, 0x281e, 0x3442, 0x040c, 0x1c54,
- 0x1032, 0x2820, 0x3444, 0x040e, 0x1c56,
- 0x0e32, 0x2620, 0x3244, 0x020e, 0x1a56,
- 0x0c32, 0x2420, 0x3044, 0x000e, 0x1856,
- 0x0c34, 0x2422, 0x3046, 0x0010, 0x1858,
- 0x0e34, 0x2622, 0x3246, 0x0210, 0x1a58,
- 0x1034, 0x2822, 0x3446, 0x0410, 0x1c58,
- 0x1224, 0x2a12, 0x3636, 0x0600, 0x1e48,
- 0x1424, 0x2c12, 0x3836, 0x0800, 0x2048,
- 0x1624, 0x2e12, 0x3a36, 0x0a00, 0x2248,
- 0x1626, 0x2e14, 0x3a38, 0x0a02, 0x224a,
- 0x1426, 0x2c14, 0x3838, 0x0802, 0x204a,
- 0x1226, 0x2a14, 0x3638, 0x0602, 0x1e4a,
- 0x1228, 0x2a16, 0x363a, 0x0604, 0x1e4c,
- 0x1428, 0x2c16, 0x383a, 0x0804, 0x204c,
- 0x1628, 0x2e16, 0x3a3a, 0x0a04, 0x224c,
- 0x162a, 0x2e18, 0x3a3c, 0x0a06, 0x224e,
- 0x142a, 0x2c18, 0x383c, 0x0806, 0x204e,
- 0x122a, 0x2a18, 0x363c, 0x0606, 0x1e4e,
- 0x122c, 0x2a1a, 0x363e, 0x0608, 0x1e50,
- 0x142c, 0x2c1a, 0x383e, 0x0808, 0x2050,
- 0x162c, 0x2e1a, 0x3a3e, 0x0a08, 0x2250,
- 0x162e, 0x2e1c, 0x3a40, 0x0a0a, 0x2252,
- 0x142e, 0x2c1c, 0x3840, 0x080a, 0x2052,
- 0x122e, 0x2a1c, 0x3640, 0x060a, 0x1e52,
- 0x1230, 0x2a1e, 0x3642, 0x060c, 0x1e54,
- 0x1430, 0x2c1e, 0x3842, 0x080c, 0x2054,
- 0x1630, 0x2e1e, 0x3a42, 0x0a0c, 0x2254,
- 0x1632, 0x2e20, 0x3a44, 0x0a0e, 0x2256,
- 0x1432, 0x2c20, 0x3844, 0x080e, 0x2056,
- 0x1232, 0x2a20, 0x3644, 0x060e, 0x1e56,
- 0x1234, 0x2a22, 0x3646, 0x0610, 0x1e58,
- 0x1434, 0x2c22, 0x3846, 0x0810, 0x2058,
- 0x1634, 0x2e22, 0x3a46, 0x0a10, 0x2258,
- 0x1824, 0x3012, 0x3c36, 0x0c00, 0x2448,
- 0x1a24, 0x3212, 0x3e36, 0x0e00, 0x2648,
- 0x1c24, 0x3412, 0x4036, 0x1000, 0x2848,
- 0x1c26, 0x3414, 0x4038, 0x1002, 0x284a,
- 0x1a26, 0x3214, 0x3e38, 0x0e02, 0x264a,
- 0x1826, 0x3014, 0x3c38, 0x0c02, 0x244a,
- 0x1828, 0x3016, 0x3c3a, 0x0c04, 0x244c,
- 0x1a28, 0x3216, 0x3e3a, 0x0e04, 0x264c,
- 0x1c28, 0x3416, 0x403a, 0x1004, 0x284c,
- 0x1c2a, 0x3418, 0x403c, 0x1006, 0x284e,
- 0x1a2a, 0x3218, 0x3e3c, 0x0e06, 0x264e,
- 0x182a, 0x3018, 0x3c3c, 0x0c06, 0x244e,
- 0x182c, 0x301a, 0x3c3e, 0x0c08, 0x2450,
- 0x1a2c, 0x321a, 0x3e3e, 0x0e08, 0x2650,
- 0x1c2c, 0x341a, 0x403e, 0x1008, 0x2850,
- 0x1c2e, 0x341c, 0x4040, 0x100a, 0x2852,
- 0x1a2e, 0x321c, 0x3e40, 0x0e0a, 0x2652,
- 0x182e, 0x301c, 0x3c40, 0x0c0a, 0x2452,
- 0x1830, 0x301e, 0x3c42, 0x0c0c, 0x2454,
- 0x1a30, 0x321e, 0x3e42, 0x0e0c, 0x2654,
- 0x1c30, 0x341e, 0x4042, 0x100c, 0x2854,
- 0x1c32, 0x3420, 0x4044, 0x100e, 0x2856,
- 0x1a32, 0x3220, 0x3e44, 0x0e0e, 0x2656,
- 0x1832, 0x3020, 0x3c44, 0x0c0e, 0x2456,
- 0x1834, 0x3022, 0x3c46, 0x0c10, 0x2458,
- 0x1a34, 0x3222, 0x3e46, 0x0e10, 0x2658,
- 0x1c34, 0x3422, 0x4046, 0x1010, 0x2858,
- 0x1e24, 0x3612, 0x4236, 0x1200, 0x2a48,
- 0x2024, 0x3812, 0x4436, 0x1400, 0x2c48,
- 0x2224, 0x3a12, 0x4636, 0x1600, 0x2e48,
- 0x2226, 0x3a14, 0x4638, 0x1602, 0x2e4a,
- 0x2026, 0x3814, 0x4438, 0x1402, 0x2c4a,
- 0x1e26, 0x3614, 0x4238, 0x1202, 0x2a4a,
- 0x1e28, 0x3616, 0x423a, 0x1204, 0x2a4c,
- 0x2028, 0x3816, 0x443a, 0x1404, 0x2c4c,
- 0x2228, 0x3a16, 0x463a, 0x1604, 0x2e4c,
- 0x222a, 0x3a18, 0x463c, 0x1606, 0x2e4e,
- 0x202a, 0x3818, 0x443c, 0x1406, 0x2c4e,
- 0x1e2a, 0x3618, 0x423c, 0x1206, 0x2a4e,
- 0x1e2c, 0x361a, 0x423e, 0x1208, 0x2a50,
- 0x202c, 0x381a, 0x443e, 0x1408, 0x2c50,
- 0x222c, 0x3a1a, 0x463e, 0x1608, 0x2e50,
- 0x222e, 0x3a1c, 0x4640, 0x160a, 0x2e52,
- 0x202e, 0x381c, 0x4440, 0x140a, 0x2c52,
- 0x1e2e, 0x361c, 0x4240, 0x120a, 0x2a52,
- 0x1e30, 0x361e, 0x4242, 0x120c, 0x2a54,
- 0x2030, 0x381e, 0x4442, 0x140c, 0x2c54,
- 0x2230, 0x3a1e, 0x4642, 0x160c, 0x2e54,
- 0x2232, 0x3a20, 0x4644, 0x160e, 0x2e56,
- 0x2032, 0x3820, 0x4444, 0x140e, 0x2c56,
- 0x1e32, 0x3620, 0x4244, 0x120e, 0x2a56,
- 0x1e34, 0x3622, 0x4246, 0x1210, 0x2a58,
- 0x2034, 0x3822, 0x4446, 0x1410, 0x2c58,
- 0x2234, 0x3a22, 0x4646, 0x1610, 0x2e58,
- 0x2424, 0x3c12, 0x0036, 0x1800, 0x3048,
- 0x2624, 0x3e12, 0x0236, 0x1a00, 0x3248,
- 0x2824, 0x4012, 0x0436, 0x1c00, 0x3448,
- 0x2826, 0x4014, 0x0438, 0x1c02, 0x344a,
- 0x2626, 0x3e14, 0x0238, 0x1a02, 0x324a,
- 0x2426, 0x3c14, 0x0038, 0x1802, 0x304a,
- 0x2428, 0x3c16, 0x003a, 0x1804, 0x304c,
- 0x2628, 0x3e16, 0x023a, 0x1a04, 0x324c,
- 0x2828, 0x4016, 0x043a, 0x1c04, 0x344c,
- 0x282a, 0x4018, 0x043c, 0x1c06, 0x344e,
- 0x262a, 0x3e18, 0x023c, 0x1a06, 0x324e,
- 0x242a, 0x3c18, 0x003c, 0x1806, 0x304e,
- 0x242c, 0x3c1a, 0x003e, 0x1808, 0x3050,
- 0x262c, 0x3e1a, 0x023e, 0x1a08, 0x3250,
- 0x282c, 0x401a, 0x043e, 0x1c08, 0x3450,
- 0x282e, 0x401c, 0x0440, 0x1c0a, 0x3452,
- 0x262e, 0x3e1c, 0x0240, 0x1a0a, 0x3252,
- 0x242e, 0x3c1c, 0x0040, 0x180a, 0x3052,
- 0x2430, 0x3c1e, 0x0042, 0x180c, 0x3054,
- 0x2630, 0x3e1e, 0x0242, 0x1a0c, 0x3254,
- 0x2830, 0x401e, 0x0442, 0x1c0c, 0x3454,
- 0x2832, 0x4020, 0x0444, 0x1c0e, 0x3456,
- 0x2632, 0x3e20, 0x0244, 0x1a0e, 0x3256,
- 0x2432, 0x3c20, 0x0044, 0x180e, 0x3056,
- 0x2434, 0x3c22, 0x0046, 0x1810, 0x3058,
- 0x2634, 0x3e22, 0x0246, 0x1a10, 0x3258,
- 0x2834, 0x4022, 0x0446, 0x1c10, 0x3458,
- 0x2a24, 0x4212, 0x0636, 0x1e00, 0x3648,
- 0x2c24, 0x4412, 0x0836, 0x2000, 0x3848,
- 0x2e24, 0x4612, 0x0a36, 0x2200, 0x3a48,
- 0x2e26, 0x4614, 0x0a38, 0x2202, 0x3a4a,
- 0x2c26, 0x4414, 0x0838, 0x2002, 0x384a,
- 0x2a26, 0x4214, 0x0638, 0x1e02, 0x364a,
- 0x2a28, 0x4216, 0x063a, 0x1e04, 0x364c,
- 0x2c28, 0x4416, 0x083a, 0x2004, 0x384c,
- 0x2e28, 0x4616, 0x0a3a, 0x2204, 0x3a4c,
- 0x2e2a, 0x4618, 0x0a3c, 0x2206, 0x3a4e,
- 0x2c2a, 0x4418, 0x083c, 0x2006, 0x384e,
- 0x2a2a, 0x4218, 0x063c, 0x1e06, 0x364e,
- 0x2a2c, 0x421a, 0x063e, 0x1e08, 0x3650,
- 0x2c2c, 0x441a, 0x083e, 0x2008, 0x3850,
- 0x2e2c, 0x461a, 0x0a3e, 0x2208, 0x3a50,
- 0x2e2e, 0x461c, 0x0a40, 0x220a, 0x3a52,
- 0x2c2e, 0x441c, 0x0840, 0x200a, 0x3852,
- 0x2a2e, 0x421c, 0x0640, 0x1e0a, 0x3652,
- 0x2a30, 0x421e, 0x0642, 0x1e0c, 0x3654,
- 0x2c30, 0x441e, 0x0842, 0x200c, 0x3854,
- 0x2e30, 0x461e, 0x0a42, 0x220c, 0x3a54,
- 0x2e32, 0x4620, 0x0a44, 0x220e, 0x3a56,
- 0x2c32, 0x4420, 0x0844, 0x200e, 0x3856,
- 0x2a32, 0x4220, 0x0644, 0x1e0e, 0x3656,
- 0x2a34, 0x4222, 0x0646, 0x1e10, 0x3658,
- 0x2c34, 0x4422, 0x0846, 0x2010, 0x3858,
- 0x2e34, 0x4622, 0x0a46, 0x2210, 0x3a58,
- 0x3024, 0x0012, 0x0c36, 0x2400, 0x3c48,
- 0x3224, 0x0212, 0x0e36, 0x2600, 0x3e48,
- 0x3424, 0x0412, 0x1036, 0x2800, 0x4048,
- 0x3426, 0x0414, 0x1038, 0x2802, 0x404a,
- 0x3226, 0x0214, 0x0e38, 0x2602, 0x3e4a,
- 0x3026, 0x0014, 0x0c38, 0x2402, 0x3c4a,
- 0x3028, 0x0016, 0x0c3a, 0x2404, 0x3c4c,
- 0x3228, 0x0216, 0x0e3a, 0x2604, 0x3e4c,
- 0x3428, 0x0416, 0x103a, 0x2804, 0x404c,
- 0x342a, 0x0418, 0x103c, 0x2806, 0x404e,
- 0x322a, 0x0218, 0x0e3c, 0x2606, 0x3e4e,
- 0x302a, 0x0018, 0x0c3c, 0x2406, 0x3c4e,
- 0x302c, 0x001a, 0x0c3e, 0x2408, 0x3c50,
- 0x322c, 0x021a, 0x0e3e, 0x2608, 0x3e50,
- 0x342c, 0x041a, 0x103e, 0x2808, 0x4050,
- 0x342e, 0x041c, 0x1040, 0x280a, 0x4052,
- 0x322e, 0x021c, 0x0e40, 0x260a, 0x3e52,
- 0x302e, 0x001c, 0x0c40, 0x240a, 0x3c52,
- 0x3030, 0x001e, 0x0c42, 0x240c, 0x3c54,
- 0x3230, 0x021e, 0x0e42, 0x260c, 0x3e54,
- 0x3430, 0x041e, 0x1042, 0x280c, 0x4054,
- 0x3432, 0x0420, 0x1044, 0x280e, 0x4056,
- 0x3232, 0x0220, 0x0e44, 0x260e, 0x3e56,
- 0x3032, 0x0020, 0x0c44, 0x240e, 0x3c56,
- 0x3034, 0x0022, 0x0c46, 0x2410, 0x3c58,
- 0x3234, 0x0222, 0x0e46, 0x2610, 0x3e58,
- 0x3434, 0x0422, 0x1046, 0x2810, 0x4058,
- 0x3624, 0x0612, 0x1236, 0x2a00, 0x4248,
- 0x3824, 0x0812, 0x1436, 0x2c00, 0x4448,
- 0x3a24, 0x0a12, 0x1636, 0x2e00, 0x4648,
- 0x3a26, 0x0a14, 0x1638, 0x2e02, 0x464a,
- 0x3826, 0x0814, 0x1438, 0x2c02, 0x444a,
- 0x3626, 0x0614, 0x1238, 0x2a02, 0x424a,
- 0x3628, 0x0616, 0x123a, 0x2a04, 0x424c,
- 0x3828, 0x0816, 0x143a, 0x2c04, 0x444c,
- 0x3a28, 0x0a16, 0x163a, 0x2e04, 0x464c,
- 0x3a2a, 0x0a18, 0x163c, 0x2e06, 0x464e,
- 0x382a, 0x0818, 0x143c, 0x2c06, 0x444e,
- 0x362a, 0x0618, 0x123c, 0x2a06, 0x424e,
- 0x362c, 0x061a, 0x123e, 0x2a08, 0x4250,
- 0x382c, 0x081a, 0x143e, 0x2c08, 0x4450,
- 0x3a2c, 0x0a1a, 0x163e, 0x2e08, 0x4650,
- 0x3a2e, 0x0a1c, 0x1640, 0x2e0a, 0x4652,
- 0x382e, 0x081c, 0x1440, 0x2c0a, 0x4452,
- 0x362e, 0x061c, 0x1240, 0x2a0a, 0x4252,
- 0x3630, 0x061e, 0x1242, 0x2a0c, 0x4254,
- 0x3830, 0x081e, 0x1442, 0x2c0c, 0x4454,
- 0x3a30, 0x0a1e, 0x1642, 0x2e0c, 0x4654,
- 0x3a32, 0x0a20, 0x1644, 0x2e0e, 0x4656,
- 0x3832, 0x0820, 0x1444, 0x2c0e, 0x4456,
- 0x3632, 0x0620, 0x1244, 0x2a0e, 0x4256,
- 0x3634, 0x0622, 0x1246, 0x2a10, 0x4258,
- 0x3834, 0x0822, 0x1446, 0x2c10, 0x4458,
- 0x3a34, 0x0a22, 0x1646, 0x2e10, 0x4658,
- 0x3c24, 0x0c12, 0x1836, 0x3000, 0x0048,
- 0x3e24, 0x0e12, 0x1a36, 0x3200, 0x0248,
- 0x4024, 0x1012, 0x1c36, 0x3400, 0x0448,
- 0x4026, 0x1014, 0x1c38, 0x3402, 0x044a,
- 0x3e26, 0x0e14, 0x1a38, 0x3202, 0x024a,
- 0x3c26, 0x0c14, 0x1838, 0x3002, 0x004a,
- 0x3c28, 0x0c16, 0x183a, 0x3004, 0x004c,
- 0x3e28, 0x0e16, 0x1a3a, 0x3204, 0x024c,
- 0x4028, 0x1016, 0x1c3a, 0x3404, 0x044c,
- 0x402a, 0x1018, 0x1c3c, 0x3406, 0x044e,
- 0x3e2a, 0x0e18, 0x1a3c, 0x3206, 0x024e,
- 0x3c2a, 0x0c18, 0x183c, 0x3006, 0x004e,
- 0x3c2c, 0x0c1a, 0x183e, 0x3008, 0x0050,
- 0x3e2c, 0x0e1a, 0x1a3e, 0x3208, 0x0250,
- 0x402c, 0x101a, 0x1c3e, 0x3408, 0x0450,
- 0x402e, 0x101c, 0x1c40, 0x340a, 0x0452,
- 0x3e2e, 0x0e1c, 0x1a40, 0x320a, 0x0252,
- 0x3c2e, 0x0c1c, 0x1840, 0x300a, 0x0052,
- 0x3c30, 0x0c1e, 0x1842, 0x300c, 0x0054,
- 0x3e30, 0x0e1e, 0x1a42, 0x320c, 0x0254,
- 0x4030, 0x101e, 0x1c42, 0x340c, 0x0454,
- 0x4032, 0x1020, 0x1c44, 0x340e, 0x0456,
- 0x3e32, 0x0e20, 0x1a44, 0x320e, 0x0256,
- 0x3c32, 0x0c20, 0x1844, 0x300e, 0x0056,
- 0x3c34, 0x0c22, 0x1846, 0x3010, 0x0058,
- 0x3e34, 0x0e22, 0x1a46, 0x3210, 0x0258,
- 0x4034, 0x1022, 0x1c46, 0x3410, 0x0458,
- 0x4224, 0x1212, 0x1e36, 0x3600, 0x0648,
- 0x4424, 0x1412, 0x2036, 0x3800, 0x0848,
- 0x4624, 0x1612, 0x2236, 0x3a00, 0x0a48,
- 0x4626, 0x1614, 0x2238, 0x3a02, 0x0a4a,
- 0x4426, 0x1414, 0x2038, 0x3802, 0x084a,
- 0x4226, 0x1214, 0x1e38, 0x3602, 0x064a,
- 0x4228, 0x1216, 0x1e3a, 0x3604, 0x064c,
- 0x4428, 0x1416, 0x203a, 0x3804, 0x084c,
- 0x4628, 0x1616, 0x223a, 0x3a04, 0x0a4c,
- 0x462a, 0x1618, 0x223c, 0x3a06, 0x0a4e,
- 0x442a, 0x1418, 0x203c, 0x3806, 0x084e,
- 0x422a, 0x1218, 0x1e3c, 0x3606, 0x064e,
- 0x422c, 0x121a, 0x1e3e, 0x3608, 0x0650,
- 0x442c, 0x141a, 0x203e, 0x3808, 0x0850,
- 0x462c, 0x161a, 0x223e, 0x3a08, 0x0a50,
- 0x462e, 0x161c, 0x2240, 0x3a0a, 0x0a52,
- 0x442e, 0x141c, 0x2040, 0x380a, 0x0852,
- 0x422e, 0x121c, 0x1e40, 0x360a, 0x0652,
- 0x4230, 0x121e, 0x1e42, 0x360c, 0x0654,
- 0x4430, 0x141e, 0x2042, 0x380c, 0x0854,
- 0x4630, 0x161e, 0x2242, 0x3a0c, 0x0a54,
- 0x4632, 0x1620, 0x2244, 0x3a0e, 0x0a56,
- 0x4432, 0x1420, 0x2044, 0x380e, 0x0856,
- 0x4232, 0x1220, 0x1e44, 0x360e, 0x0656,
- 0x4234, 0x1222, 0x1e46, 0x3610, 0x0658,
- 0x4434, 0x1422, 0x2046, 0x3810, 0x0858,
- 0x4634, 0x1622, 0x2246, 0x3a10, 0x0a58,
- 0x0024, 0x1812, 0x2436, 0x3c00, 0x0c48,
- 0x0224, 0x1a12, 0x2636, 0x3e00, 0x0e48,
- 0x0424, 0x1c12, 0x2836, 0x4000, 0x1048,
- 0x0426, 0x1c14, 0x2838, 0x4002, 0x104a,
- 0x0226, 0x1a14, 0x2638, 0x3e02, 0x0e4a,
- 0x0026, 0x1814, 0x2438, 0x3c02, 0x0c4a,
- 0x0028, 0x1816, 0x243a, 0x3c04, 0x0c4c,
- 0x0228, 0x1a16, 0x263a, 0x3e04, 0x0e4c,
- 0x0428, 0x1c16, 0x283a, 0x4004, 0x104c,
- 0x042a, 0x1c18, 0x283c, 0x4006, 0x104e,
- 0x022a, 0x1a18, 0x263c, 0x3e06, 0x0e4e,
- 0x002a, 0x1818, 0x243c, 0x3c06, 0x0c4e,
- 0x002c, 0x181a, 0x243e, 0x3c08, 0x0c50,
- 0x022c, 0x1a1a, 0x263e, 0x3e08, 0x0e50,
- 0x042c, 0x1c1a, 0x283e, 0x4008, 0x1050,
- 0x042e, 0x1c1c, 0x2840, 0x400a, 0x1052,
- 0x022e, 0x1a1c, 0x2640, 0x3e0a, 0x0e52,
- 0x002e, 0x181c, 0x2440, 0x3c0a, 0x0c52,
- 0x0030, 0x181e, 0x2442, 0x3c0c, 0x0c54,
- 0x0230, 0x1a1e, 0x2642, 0x3e0c, 0x0e54,
- 0x0430, 0x1c1e, 0x2842, 0x400c, 0x1054,
- 0x0432, 0x1c20, 0x2844, 0x400e, 0x1056,
- 0x0232, 0x1a20, 0x2644, 0x3e0e, 0x0e56,
- 0x0032, 0x1820, 0x2444, 0x3c0e, 0x0c56,
- 0x0034, 0x1822, 0x2446, 0x3c10, 0x0c58,
- 0x0234, 0x1a22, 0x2646, 0x3e10, 0x0e58,
- 0x0434, 0x1c22, 0x2846, 0x4010, 0x1058,
- 0x0624, 0x1e12, 0x2a36, 0x4200, 0x1248,
- 0x0824, 0x2012, 0x2c36, 0x4400, 0x1448,
- 0x0a24, 0x2212, 0x2e36, 0x4600, 0x1648,
- 0x0a26, 0x2214, 0x2e38, 0x4602, 0x164a,
- 0x0826, 0x2014, 0x2c38, 0x4402, 0x144a,
- 0x0626, 0x1e14, 0x2a38, 0x4202, 0x124a,
- 0x0628, 0x1e16, 0x2a3a, 0x4204, 0x124c,
- 0x0828, 0x2016, 0x2c3a, 0x4404, 0x144c,
- 0x0a28, 0x2216, 0x2e3a, 0x4604, 0x164c,
- 0x0a2a, 0x2218, 0x2e3c, 0x4606, 0x164e,
- 0x082a, 0x2018, 0x2c3c, 0x4406, 0x144e,
- 0x062a, 0x1e18, 0x2a3c, 0x4206, 0x124e,
- 0x062c, 0x1e1a, 0x2a3e, 0x4208, 0x1250,
- 0x082c, 0x201a, 0x2c3e, 0x4408, 0x1450,
- 0x0a2c, 0x221a, 0x2e3e, 0x4608, 0x1650,
- 0x0a2e, 0x221c, 0x2e40, 0x460a, 0x1652,
- 0x082e, 0x201c, 0x2c40, 0x440a, 0x1452,
- 0x062e, 0x1e1c, 0x2a40, 0x420a, 0x1252,
- 0x0630, 0x1e1e, 0x2a42, 0x420c, 0x1254,
- 0x0830, 0x201e, 0x2c42, 0x440c, 0x1454,
- 0x0a30, 0x221e, 0x2e42, 0x460c, 0x1654,
- 0x0a32, 0x2220, 0x2e44, 0x460e, 0x1656,
- 0x0832, 0x2020, 0x2c44, 0x440e, 0x1456,
- 0x0632, 0x1e20, 0x2a44, 0x420e, 0x1256,
- 0x0634, 0x1e22, 0x2a46, 0x4210, 0x1258,
- 0x0834, 0x2022, 0x2c46, 0x4410, 0x1458,
- 0x0a34, 0x2222, 0x2e46, 0x4610, 0x1658,
-};
-
-static const uint16_t dv_place_411P[1620] = {
- 0x0c24, 0x2710, 0x3334, 0x0000, 0x1848,
- 0x0d24, 0x2810, 0x3434, 0x0100, 0x1948,
- 0x0e24, 0x2910, 0x3534, 0x0200, 0x1a48,
- 0x0f24, 0x2914, 0x3538, 0x0300, 0x1b48,
- 0x1024, 0x2814, 0x3438, 0x0400, 0x1c48,
- 0x1124, 0x2714, 0x3338, 0x0500, 0x1d48,
- 0x1128, 0x2614, 0x3238, 0x0504, 0x1d4c,
- 0x1028, 0x2514, 0x3138, 0x0404, 0x1c4c,
- 0x0f28, 0x2414, 0x3038, 0x0304, 0x1b4c,
- 0x0e28, 0x2418, 0x303c, 0x0204, 0x1a4c,
- 0x0d28, 0x2518, 0x313c, 0x0104, 0x194c,
- 0x0c28, 0x2618, 0x323c, 0x0004, 0x184c,
- 0x0c2c, 0x2718, 0x333c, 0x0008, 0x1850,
- 0x0d2c, 0x2818, 0x343c, 0x0108, 0x1950,
- 0x0e2c, 0x2918, 0x353c, 0x0208, 0x1a50,
- 0x0f2c, 0x291c, 0x3540, 0x0308, 0x1b50,
- 0x102c, 0x281c, 0x3440, 0x0408, 0x1c50,
- 0x112c, 0x271c, 0x3340, 0x0508, 0x1d50,
- 0x1130, 0x261c, 0x3240, 0x050c, 0x1d54,
- 0x1030, 0x251c, 0x3140, 0x040c, 0x1c54,
- 0x0f30, 0x241c, 0x3040, 0x030c, 0x1b54,
- 0x0e30, 0x2420, 0x3044, 0x020c, 0x1a54,
- 0x0d30, 0x2520, 0x3144, 0x010c, 0x1954,
- 0x0c30, 0x2620, 0x3244, 0x000c, 0x1854,
- 0x0c34, 0x2720, 0x3344, 0x0010, 0x1858,
- 0x0d34, 0x2820, 0x3444, 0x0110, 0x1a58,
- 0x0e34, 0x2920, 0x3544, 0x0210, 0x1c58,
- 0x1224, 0x2d10, 0x3934, 0x0600, 0x1e48,
- 0x1324, 0x2e10, 0x3a34, 0x0700, 0x1f48,
- 0x1424, 0x2f10, 0x3b34, 0x0800, 0x2048,
- 0x1524, 0x2f14, 0x3b38, 0x0900, 0x2148,
- 0x1624, 0x2e14, 0x3a38, 0x0a00, 0x2248,
- 0x1724, 0x2d14, 0x3938, 0x0b00, 0x2348,
- 0x1728, 0x2c14, 0x3838, 0x0b04, 0x234c,
- 0x1628, 0x2b14, 0x3738, 0x0a04, 0x224c,
- 0x1528, 0x2a14, 0x3638, 0x0904, 0x214c,
- 0x1428, 0x2a18, 0x363c, 0x0804, 0x204c,
- 0x1328, 0x2b18, 0x373c, 0x0704, 0x1f4c,
- 0x1228, 0x2c18, 0x383c, 0x0604, 0x1e4c,
- 0x122c, 0x2d18, 0x393c, 0x0608, 0x1e50,
- 0x132c, 0x2e18, 0x3a3c, 0x0708, 0x1f50,
- 0x142c, 0x2f18, 0x3b3c, 0x0808, 0x2050,
- 0x152c, 0x2f1c, 0x3b40, 0x0908, 0x2150,
- 0x162c, 0x2e1c, 0x3a40, 0x0a08, 0x2250,
- 0x172c, 0x2d1c, 0x3940, 0x0b08, 0x2350,
- 0x1730, 0x2c1c, 0x3840, 0x0b0c, 0x2354,
- 0x1630, 0x2b1c, 0x3740, 0x0a0c, 0x2254,
- 0x1530, 0x2a1c, 0x3640, 0x090c, 0x2154,
- 0x1430, 0x2a20, 0x3644, 0x080c, 0x2054,
- 0x1330, 0x2b20, 0x3744, 0x070c, 0x1f54,
- 0x1230, 0x2c20, 0x3844, 0x060c, 0x1e54,
- 0x1234, 0x2d20, 0x3944, 0x0610, 0x1e58,
- 0x1334, 0x2e20, 0x3a44, 0x0710, 0x2058,
- 0x1434, 0x2f20, 0x3b44, 0x0810, 0x2258,
- 0x1824, 0x3310, 0x3f34, 0x0c00, 0x2448,
- 0x1924, 0x3410, 0x4034, 0x0d00, 0x2548,
- 0x1a24, 0x3510, 0x4134, 0x0e00, 0x2648,
- 0x1b24, 0x3514, 0x4138, 0x0f00, 0x2748,
- 0x1c24, 0x3414, 0x4038, 0x1000, 0x2848,
- 0x1d24, 0x3314, 0x3f38, 0x1100, 0x2948,
- 0x1d28, 0x3214, 0x3e38, 0x1104, 0x294c,
- 0x1c28, 0x3114, 0x3d38, 0x1004, 0x284c,
- 0x1b28, 0x3014, 0x3c38, 0x0f04, 0x274c,
- 0x1a28, 0x3018, 0x3c3c, 0x0e04, 0x264c,
- 0x1928, 0x3118, 0x3d3c, 0x0d04, 0x254c,
- 0x1828, 0x3218, 0x3e3c, 0x0c04, 0x244c,
- 0x182c, 0x3318, 0x3f3c, 0x0c08, 0x2450,
- 0x192c, 0x3418, 0x403c, 0x0d08, 0x2550,
- 0x1a2c, 0x3518, 0x413c, 0x0e08, 0x2650,
- 0x1b2c, 0x351c, 0x4140, 0x0f08, 0x2750,
- 0x1c2c, 0x341c, 0x4040, 0x1008, 0x2850,
- 0x1d2c, 0x331c, 0x3f40, 0x1108, 0x2950,
- 0x1d30, 0x321c, 0x3e40, 0x110c, 0x2954,
- 0x1c30, 0x311c, 0x3d40, 0x100c, 0x2854,
- 0x1b30, 0x301c, 0x3c40, 0x0f0c, 0x2754,
- 0x1a30, 0x3020, 0x3c44, 0x0e0c, 0x2654,
- 0x1930, 0x3120, 0x3d44, 0x0d0c, 0x2554,
- 0x1830, 0x3220, 0x3e44, 0x0c0c, 0x2454,
- 0x1834, 0x3320, 0x3f44, 0x0c10, 0x2458,
- 0x1934, 0x3420, 0x4044, 0x0d10, 0x2658,
- 0x1a34, 0x3520, 0x4144, 0x0e10, 0x2858,
- 0x1e24, 0x3910, 0x4534, 0x1200, 0x2a48,
- 0x1f24, 0x3a10, 0x4634, 0x1300, 0x2b48,
- 0x2024, 0x3b10, 0x4734, 0x1400, 0x2c48,
- 0x2124, 0x3b14, 0x4738, 0x1500, 0x2d48,
- 0x2224, 0x3a14, 0x4638, 0x1600, 0x2e48,
- 0x2324, 0x3914, 0x4538, 0x1700, 0x2f48,
- 0x2328, 0x3814, 0x4438, 0x1704, 0x2f4c,
- 0x2228, 0x3714, 0x4338, 0x1604, 0x2e4c,
- 0x2128, 0x3614, 0x4238, 0x1504, 0x2d4c,
- 0x2028, 0x3618, 0x423c, 0x1404, 0x2c4c,
- 0x1f28, 0x3718, 0x433c, 0x1304, 0x2b4c,
- 0x1e28, 0x3818, 0x443c, 0x1204, 0x2a4c,
- 0x1e2c, 0x3918, 0x453c, 0x1208, 0x2a50,
- 0x1f2c, 0x3a18, 0x463c, 0x1308, 0x2b50,
- 0x202c, 0x3b18, 0x473c, 0x1408, 0x2c50,
- 0x212c, 0x3b1c, 0x4740, 0x1508, 0x2d50,
- 0x222c, 0x3a1c, 0x4640, 0x1608, 0x2e50,
- 0x232c, 0x391c, 0x4540, 0x1708, 0x2f50,
- 0x2330, 0x381c, 0x4440, 0x170c, 0x2f54,
- 0x2230, 0x371c, 0x4340, 0x160c, 0x2e54,
- 0x2130, 0x361c, 0x4240, 0x150c, 0x2d54,
- 0x2030, 0x3620, 0x4244, 0x140c, 0x2c54,
- 0x1f30, 0x3720, 0x4344, 0x130c, 0x2b54,
- 0x1e30, 0x3820, 0x4444, 0x120c, 0x2a54,
- 0x1e34, 0x3920, 0x4544, 0x1210, 0x2a58,
- 0x1f34, 0x3a20, 0x4644, 0x1310, 0x2c58,
- 0x2034, 0x3b20, 0x4744, 0x1410, 0x2e58,
- 0x2424, 0x3f10, 0x0334, 0x1800, 0x3048,
- 0x2524, 0x4010, 0x0434, 0x1900, 0x3148,
- 0x2624, 0x4110, 0x0534, 0x1a00, 0x3248,
- 0x2724, 0x4114, 0x0538, 0x1b00, 0x3348,
- 0x2824, 0x4014, 0x0438, 0x1c00, 0x3448,
- 0x2924, 0x3f14, 0x0338, 0x1d00, 0x3548,
- 0x2928, 0x3e14, 0x0238, 0x1d04, 0x354c,
- 0x2828, 0x3d14, 0x0138, 0x1c04, 0x344c,
- 0x2728, 0x3c14, 0x0038, 0x1b04, 0x334c,
- 0x2628, 0x3c18, 0x003c, 0x1a04, 0x324c,
- 0x2528, 0x3d18, 0x013c, 0x1904, 0x314c,
- 0x2428, 0x3e18, 0x023c, 0x1804, 0x304c,
- 0x242c, 0x3f18, 0x033c, 0x1808, 0x3050,
- 0x252c, 0x4018, 0x043c, 0x1908, 0x3150,
- 0x262c, 0x4118, 0x053c, 0x1a08, 0x3250,
- 0x272c, 0x411c, 0x0540, 0x1b08, 0x3350,
- 0x282c, 0x401c, 0x0440, 0x1c08, 0x3450,
- 0x292c, 0x3f1c, 0x0340, 0x1d08, 0x3550,
- 0x2930, 0x3e1c, 0x0240, 0x1d0c, 0x3554,
- 0x2830, 0x3d1c, 0x0140, 0x1c0c, 0x3454,
- 0x2730, 0x3c1c, 0x0040, 0x1b0c, 0x3354,
- 0x2630, 0x3c20, 0x0044, 0x1a0c, 0x3254,
- 0x2530, 0x3d20, 0x0144, 0x190c, 0x3154,
- 0x2430, 0x3e20, 0x0244, 0x180c, 0x3054,
- 0x2434, 0x3f20, 0x0344, 0x1810, 0x3058,
- 0x2534, 0x4020, 0x0444, 0x1910, 0x3258,
- 0x2634, 0x4120, 0x0544, 0x1a10, 0x3458,
- 0x2a24, 0x4510, 0x0934, 0x1e00, 0x3648,
- 0x2b24, 0x4610, 0x0a34, 0x1f00, 0x3748,
- 0x2c24, 0x4710, 0x0b34, 0x2000, 0x3848,
- 0x2d24, 0x4714, 0x0b38, 0x2100, 0x3948,
- 0x2e24, 0x4614, 0x0a38, 0x2200, 0x3a48,
- 0x2f24, 0x4514, 0x0938, 0x2300, 0x3b48,
- 0x2f28, 0x4414, 0x0838, 0x2304, 0x3b4c,
- 0x2e28, 0x4314, 0x0738, 0x2204, 0x3a4c,
- 0x2d28, 0x4214, 0x0638, 0x2104, 0x394c,
- 0x2c28, 0x4218, 0x063c, 0x2004, 0x384c,
- 0x2b28, 0x4318, 0x073c, 0x1f04, 0x374c,
- 0x2a28, 0x4418, 0x083c, 0x1e04, 0x364c,
- 0x2a2c, 0x4518, 0x093c, 0x1e08, 0x3650,
- 0x2b2c, 0x4618, 0x0a3c, 0x1f08, 0x3750,
- 0x2c2c, 0x4718, 0x0b3c, 0x2008, 0x3850,
- 0x2d2c, 0x471c, 0x0b40, 0x2108, 0x3950,
- 0x2e2c, 0x461c, 0x0a40, 0x2208, 0x3a50,
- 0x2f2c, 0x451c, 0x0940, 0x2308, 0x3b50,
- 0x2f30, 0x441c, 0x0840, 0x230c, 0x3b54,
- 0x2e30, 0x431c, 0x0740, 0x220c, 0x3a54,
- 0x2d30, 0x421c, 0x0640, 0x210c, 0x3954,
- 0x2c30, 0x4220, 0x0644, 0x200c, 0x3854,
- 0x2b30, 0x4320, 0x0744, 0x1f0c, 0x3754,
- 0x2a30, 0x4420, 0x0844, 0x1e0c, 0x3654,
- 0x2a34, 0x4520, 0x0944, 0x1e10, 0x3658,
- 0x2b34, 0x4620, 0x0a44, 0x1f10, 0x3858,
- 0x2c34, 0x4720, 0x0b44, 0x2010, 0x3a58,
- 0x3024, 0x0310, 0x0f34, 0x2400, 0x3c48,
- 0x3124, 0x0410, 0x1034, 0x2500, 0x3d48,
- 0x3224, 0x0510, 0x1134, 0x2600, 0x3e48,
- 0x3324, 0x0514, 0x1138, 0x2700, 0x3f48,
- 0x3424, 0x0414, 0x1038, 0x2800, 0x4048,
- 0x3524, 0x0314, 0x0f38, 0x2900, 0x4148,
- 0x3528, 0x0214, 0x0e38, 0x2904, 0x414c,
- 0x3428, 0x0114, 0x0d38, 0x2804, 0x404c,
- 0x3328, 0x0014, 0x0c38, 0x2704, 0x3f4c,
- 0x3228, 0x0018, 0x0c3c, 0x2604, 0x3e4c,
- 0x3128, 0x0118, 0x0d3c, 0x2504, 0x3d4c,
- 0x3028, 0x0218, 0x0e3c, 0x2404, 0x3c4c,
- 0x302c, 0x0318, 0x0f3c, 0x2408, 0x3c50,
- 0x312c, 0x0418, 0x103c, 0x2508, 0x3d50,
- 0x322c, 0x0518, 0x113c, 0x2608, 0x3e50,
- 0x332c, 0x051c, 0x1140, 0x2708, 0x3f50,
- 0x342c, 0x041c, 0x1040, 0x2808, 0x4050,
- 0x352c, 0x031c, 0x0f40, 0x2908, 0x4150,
- 0x3530, 0x021c, 0x0e40, 0x290c, 0x4154,
- 0x3430, 0x011c, 0x0d40, 0x280c, 0x4054,
- 0x3330, 0x001c, 0x0c40, 0x270c, 0x3f54,
- 0x3230, 0x0020, 0x0c44, 0x260c, 0x3e54,
- 0x3130, 0x0120, 0x0d44, 0x250c, 0x3d54,
- 0x3030, 0x0220, 0x0e44, 0x240c, 0x3c54,
- 0x3034, 0x0320, 0x0f44, 0x2410, 0x3c58,
- 0x3134, 0x0420, 0x1044, 0x2510, 0x3e58,
- 0x3234, 0x0520, 0x1144, 0x2610, 0x4058,
- 0x3624, 0x0910, 0x1534, 0x2a00, 0x4248,
- 0x3724, 0x0a10, 0x1634, 0x2b00, 0x4348,
- 0x3824, 0x0b10, 0x1734, 0x2c00, 0x4448,
- 0x3924, 0x0b14, 0x1738, 0x2d00, 0x4548,
- 0x3a24, 0x0a14, 0x1638, 0x2e00, 0x4648,
- 0x3b24, 0x0914, 0x1538, 0x2f00, 0x4748,
- 0x3b28, 0x0814, 0x1438, 0x2f04, 0x474c,
- 0x3a28, 0x0714, 0x1338, 0x2e04, 0x464c,
- 0x3928, 0x0614, 0x1238, 0x2d04, 0x454c,
- 0x3828, 0x0618, 0x123c, 0x2c04, 0x444c,
- 0x3728, 0x0718, 0x133c, 0x2b04, 0x434c,
- 0x3628, 0x0818, 0x143c, 0x2a04, 0x424c,
- 0x362c, 0x0918, 0x153c, 0x2a08, 0x4250,
- 0x372c, 0x0a18, 0x163c, 0x2b08, 0x4350,
- 0x382c, 0x0b18, 0x173c, 0x2c08, 0x4450,
- 0x392c, 0x0b1c, 0x1740, 0x2d08, 0x4550,
- 0x3a2c, 0x0a1c, 0x1640, 0x2e08, 0x4650,
- 0x3b2c, 0x091c, 0x1540, 0x2f08, 0x4750,
- 0x3b30, 0x081c, 0x1440, 0x2f0c, 0x4754,
- 0x3a30, 0x071c, 0x1340, 0x2e0c, 0x4654,
- 0x3930, 0x061c, 0x1240, 0x2d0c, 0x4554,
- 0x3830, 0x0620, 0x1244, 0x2c0c, 0x4454,
- 0x3730, 0x0720, 0x1344, 0x2b0c, 0x4354,
- 0x3630, 0x0820, 0x1444, 0x2a0c, 0x4254,
- 0x3634, 0x0920, 0x1544, 0x2a10, 0x4258,
- 0x3734, 0x0a20, 0x1644, 0x2b10, 0x4458,
- 0x3834, 0x0b20, 0x1744, 0x2c10, 0x4658,
- 0x3c24, 0x0f10, 0x1b34, 0x3000, 0x0048,
- 0x3d24, 0x1010, 0x1c34, 0x3100, 0x0148,
- 0x3e24, 0x1110, 0x1d34, 0x3200, 0x0248,
- 0x3f24, 0x1114, 0x1d38, 0x3300, 0x0348,
- 0x4024, 0x1014, 0x1c38, 0x3400, 0x0448,
- 0x4124, 0x0f14, 0x1b38, 0x3500, 0x0548,
- 0x4128, 0x0e14, 0x1a38, 0x3504, 0x054c,
- 0x4028, 0x0d14, 0x1938, 0x3404, 0x044c,
- 0x3f28, 0x0c14, 0x1838, 0x3304, 0x034c,
- 0x3e28, 0x0c18, 0x183c, 0x3204, 0x024c,
- 0x3d28, 0x0d18, 0x193c, 0x3104, 0x014c,
- 0x3c28, 0x0e18, 0x1a3c, 0x3004, 0x004c,
- 0x3c2c, 0x0f18, 0x1b3c, 0x3008, 0x0050,
- 0x3d2c, 0x1018, 0x1c3c, 0x3108, 0x0150,
- 0x3e2c, 0x1118, 0x1d3c, 0x3208, 0x0250,
- 0x3f2c, 0x111c, 0x1d40, 0x3308, 0x0350,
- 0x402c, 0x101c, 0x1c40, 0x3408, 0x0450,
- 0x412c, 0x0f1c, 0x1b40, 0x3508, 0x0550,
- 0x4130, 0x0e1c, 0x1a40, 0x350c, 0x0554,
- 0x4030, 0x0d1c, 0x1940, 0x340c, 0x0454,
- 0x3f30, 0x0c1c, 0x1840, 0x330c, 0x0354,
- 0x3e30, 0x0c20, 0x1844, 0x320c, 0x0254,
- 0x3d30, 0x0d20, 0x1944, 0x310c, 0x0154,
- 0x3c30, 0x0e20, 0x1a44, 0x300c, 0x0054,
- 0x3c34, 0x0f20, 0x1b44, 0x3010, 0x0058,
- 0x3d34, 0x1020, 0x1c44, 0x3110, 0x0258,
- 0x3e34, 0x1120, 0x1d44, 0x3210, 0x0458,
- 0x4224, 0x1510, 0x2134, 0x3600, 0x0648,
- 0x4324, 0x1610, 0x2234, 0x3700, 0x0748,
- 0x4424, 0x1710, 0x2334, 0x3800, 0x0848,
- 0x4524, 0x1714, 0x2338, 0x3900, 0x0948,
- 0x4624, 0x1614, 0x2238, 0x3a00, 0x0a48,
- 0x4724, 0x1514, 0x2138, 0x3b00, 0x0b48,
- 0x4728, 0x1414, 0x2038, 0x3b04, 0x0b4c,
- 0x4628, 0x1314, 0x1f38, 0x3a04, 0x0a4c,
- 0x4528, 0x1214, 0x1e38, 0x3904, 0x094c,
- 0x4428, 0x1218, 0x1e3c, 0x3804, 0x084c,
- 0x4328, 0x1318, 0x1f3c, 0x3704, 0x074c,
- 0x4228, 0x1418, 0x203c, 0x3604, 0x064c,
- 0x422c, 0x1518, 0x213c, 0x3608, 0x0650,
- 0x432c, 0x1618, 0x223c, 0x3708, 0x0750,
- 0x442c, 0x1718, 0x233c, 0x3808, 0x0850,
- 0x452c, 0x171c, 0x2340, 0x3908, 0x0950,
- 0x462c, 0x161c, 0x2240, 0x3a08, 0x0a50,
- 0x472c, 0x151c, 0x2140, 0x3b08, 0x0b50,
- 0x4730, 0x141c, 0x2040, 0x3b0c, 0x0b54,
- 0x4630, 0x131c, 0x1f40, 0x3a0c, 0x0a54,
- 0x4530, 0x121c, 0x1e40, 0x390c, 0x0954,
- 0x4430, 0x1220, 0x1e44, 0x380c, 0x0854,
- 0x4330, 0x1320, 0x1f44, 0x370c, 0x0754,
- 0x4230, 0x1420, 0x2044, 0x360c, 0x0654,
- 0x4234, 0x1520, 0x2144, 0x3610, 0x0658,
- 0x4334, 0x1620, 0x2244, 0x3710, 0x0858,
- 0x4434, 0x1720, 0x2344, 0x3810, 0x0a58,
- 0x0024, 0x1b10, 0x2734, 0x3c00, 0x0c48,
- 0x0124, 0x1c10, 0x2834, 0x3d00, 0x0d48,
- 0x0224, 0x1d10, 0x2934, 0x3e00, 0x0e48,
- 0x0324, 0x1d14, 0x2938, 0x3f00, 0x0f48,
- 0x0424, 0x1c14, 0x2838, 0x4000, 0x1048,
- 0x0524, 0x1b14, 0x2738, 0x4100, 0x1148,
- 0x0528, 0x1a14, 0x2638, 0x4104, 0x114c,
- 0x0428, 0x1914, 0x2538, 0x4004, 0x104c,
- 0x0328, 0x1814, 0x2438, 0x3f04, 0x0f4c,
- 0x0228, 0x1818, 0x243c, 0x3e04, 0x0e4c,
- 0x0128, 0x1918, 0x253c, 0x3d04, 0x0d4c,
- 0x0028, 0x1a18, 0x263c, 0x3c04, 0x0c4c,
- 0x002c, 0x1b18, 0x273c, 0x3c08, 0x0c50,
- 0x012c, 0x1c18, 0x283c, 0x3d08, 0x0d50,
- 0x022c, 0x1d18, 0x293c, 0x3e08, 0x0e50,
- 0x032c, 0x1d1c, 0x2940, 0x3f08, 0x0f50,
- 0x042c, 0x1c1c, 0x2840, 0x4008, 0x1050,
- 0x052c, 0x1b1c, 0x2740, 0x4108, 0x1150,
- 0x0530, 0x1a1c, 0x2640, 0x410c, 0x1154,
- 0x0430, 0x191c, 0x2540, 0x400c, 0x1054,
- 0x0330, 0x181c, 0x2440, 0x3f0c, 0x0f54,
- 0x0230, 0x1820, 0x2444, 0x3e0c, 0x0e54,
- 0x0130, 0x1920, 0x2544, 0x3d0c, 0x0d54,
- 0x0030, 0x1a20, 0x2644, 0x3c0c, 0x0c54,
- 0x0034, 0x1b20, 0x2744, 0x3c10, 0x0c58,
- 0x0134, 0x1c20, 0x2844, 0x3d10, 0x0e58,
- 0x0234, 0x1d20, 0x2944, 0x3e10, 0x1058,
- 0x0624, 0x2110, 0x2d34, 0x4200, 0x1248,
- 0x0724, 0x2210, 0x2e34, 0x4300, 0x1348,
- 0x0824, 0x2310, 0x2f34, 0x4400, 0x1448,
- 0x0924, 0x2314, 0x2f38, 0x4500, 0x1548,
- 0x0a24, 0x2214, 0x2e38, 0x4600, 0x1648,
- 0x0b24, 0x2114, 0x2d38, 0x4700, 0x1748,
- 0x0b28, 0x2014, 0x2c38, 0x4704, 0x174c,
- 0x0a28, 0x1f14, 0x2b38, 0x4604, 0x164c,
- 0x0928, 0x1e14, 0x2a38, 0x4504, 0x154c,
- 0x0828, 0x1e18, 0x2a3c, 0x4404, 0x144c,
- 0x0728, 0x1f18, 0x2b3c, 0x4304, 0x134c,
- 0x0628, 0x2018, 0x2c3c, 0x4204, 0x124c,
- 0x062c, 0x2118, 0x2d3c, 0x4208, 0x1250,
- 0x072c, 0x2218, 0x2e3c, 0x4308, 0x1350,
- 0x082c, 0x2318, 0x2f3c, 0x4408, 0x1450,
- 0x092c, 0x231c, 0x2f40, 0x4508, 0x1550,
- 0x0a2c, 0x221c, 0x2e40, 0x4608, 0x1650,
- 0x0b2c, 0x211c, 0x2d40, 0x4708, 0x1750,
- 0x0b30, 0x201c, 0x2c40, 0x470c, 0x1754,
- 0x0a30, 0x1f1c, 0x2b40, 0x460c, 0x1654,
- 0x0930, 0x1e1c, 0x2a40, 0x450c, 0x1554,
- 0x0830, 0x1e20, 0x2a44, 0x440c, 0x1454,
- 0x0730, 0x1f20, 0x2b44, 0x430c, 0x1354,
- 0x0630, 0x2020, 0x2c44, 0x420c, 0x1254,
- 0x0634, 0x2120, 0x2d44, 0x4210, 0x1258,
- 0x0734, 0x2220, 0x2e44, 0x4310, 0x1458,
- 0x0834, 0x2320, 0x2f44, 0x4410, 0x1658,
-};
-
-static const uint16_t dv_place_411[1350] = {
- 0x0c24, 0x2710, 0x3334, 0x0000, 0x1848,
- 0x0d24, 0x2810, 0x3434, 0x0100, 0x1948,
- 0x0e24, 0x2910, 0x3534, 0x0200, 0x1a48,
- 0x0f24, 0x2914, 0x3538, 0x0300, 0x1b48,
- 0x1024, 0x2814, 0x3438, 0x0400, 0x1c48,
- 0x1124, 0x2714, 0x3338, 0x0500, 0x1d48,
- 0x1128, 0x2614, 0x3238, 0x0504, 0x1d4c,
- 0x1028, 0x2514, 0x3138, 0x0404, 0x1c4c,
- 0x0f28, 0x2414, 0x3038, 0x0304, 0x1b4c,
- 0x0e28, 0x2418, 0x303c, 0x0204, 0x1a4c,
- 0x0d28, 0x2518, 0x313c, 0x0104, 0x194c,
- 0x0c28, 0x2618, 0x323c, 0x0004, 0x184c,
- 0x0c2c, 0x2718, 0x333c, 0x0008, 0x1850,
- 0x0d2c, 0x2818, 0x343c, 0x0108, 0x1950,
- 0x0e2c, 0x2918, 0x353c, 0x0208, 0x1a50,
- 0x0f2c, 0x291c, 0x3540, 0x0308, 0x1b50,
- 0x102c, 0x281c, 0x3440, 0x0408, 0x1c50,
- 0x112c, 0x271c, 0x3340, 0x0508, 0x1d50,
- 0x1130, 0x261c, 0x3240, 0x050c, 0x1d54,
- 0x1030, 0x251c, 0x3140, 0x040c, 0x1c54,
- 0x0f30, 0x241c, 0x3040, 0x030c, 0x1b54,
- 0x0e30, 0x2420, 0x3044, 0x020c, 0x1a54,
- 0x0d30, 0x2520, 0x3144, 0x010c, 0x1954,
- 0x0c30, 0x2620, 0x3244, 0x000c, 0x1854,
- 0x0c34, 0x2720, 0x3344, 0x0010, 0x1858,
- 0x0d34, 0x2820, 0x3444, 0x0110, 0x1a58,
- 0x0e34, 0x2920, 0x3544, 0x0210, 0x1c58,
- 0x1224, 0x2d10, 0x3934, 0x0600, 0x1e48,
- 0x1324, 0x2e10, 0x3a34, 0x0700, 0x1f48,
- 0x1424, 0x2f10, 0x3b34, 0x0800, 0x2048,
- 0x1524, 0x2f14, 0x3b38, 0x0900, 0x2148,
- 0x1624, 0x2e14, 0x3a38, 0x0a00, 0x2248,
- 0x1724, 0x2d14, 0x3938, 0x0b00, 0x2348,
- 0x1728, 0x2c14, 0x3838, 0x0b04, 0x234c,
- 0x1628, 0x2b14, 0x3738, 0x0a04, 0x224c,
- 0x1528, 0x2a14, 0x3638, 0x0904, 0x214c,
- 0x1428, 0x2a18, 0x363c, 0x0804, 0x204c,
- 0x1328, 0x2b18, 0x373c, 0x0704, 0x1f4c,
- 0x1228, 0x2c18, 0x383c, 0x0604, 0x1e4c,
- 0x122c, 0x2d18, 0x393c, 0x0608, 0x1e50,
- 0x132c, 0x2e18, 0x3a3c, 0x0708, 0x1f50,
- 0x142c, 0x2f18, 0x3b3c, 0x0808, 0x2050,
- 0x152c, 0x2f1c, 0x3b40, 0x0908, 0x2150,
- 0x162c, 0x2e1c, 0x3a40, 0x0a08, 0x2250,
- 0x172c, 0x2d1c, 0x3940, 0x0b08, 0x2350,
- 0x1730, 0x2c1c, 0x3840, 0x0b0c, 0x2354,
- 0x1630, 0x2b1c, 0x3740, 0x0a0c, 0x2254,
- 0x1530, 0x2a1c, 0x3640, 0x090c, 0x2154,
- 0x1430, 0x2a20, 0x3644, 0x080c, 0x2054,
- 0x1330, 0x2b20, 0x3744, 0x070c, 0x1f54,
- 0x1230, 0x2c20, 0x3844, 0x060c, 0x1e54,
- 0x1234, 0x2d20, 0x3944, 0x0610, 0x1e58,
- 0x1334, 0x2e20, 0x3a44, 0x0710, 0x2058,
- 0x1434, 0x2f20, 0x3b44, 0x0810, 0x2258,
- 0x1824, 0x3310, 0x0334, 0x0c00, 0x2448,
- 0x1924, 0x3410, 0x0434, 0x0d00, 0x2548,
- 0x1a24, 0x3510, 0x0534, 0x0e00, 0x2648,
- 0x1b24, 0x3514, 0x0538, 0x0f00, 0x2748,
- 0x1c24, 0x3414, 0x0438, 0x1000, 0x2848,
- 0x1d24, 0x3314, 0x0338, 0x1100, 0x2948,
- 0x1d28, 0x3214, 0x0238, 0x1104, 0x294c,
- 0x1c28, 0x3114, 0x0138, 0x1004, 0x284c,
- 0x1b28, 0x3014, 0x0038, 0x0f04, 0x274c,
- 0x1a28, 0x3018, 0x003c, 0x0e04, 0x264c,
- 0x1928, 0x3118, 0x013c, 0x0d04, 0x254c,
- 0x1828, 0x3218, 0x023c, 0x0c04, 0x244c,
- 0x182c, 0x3318, 0x033c, 0x0c08, 0x2450,
- 0x192c, 0x3418, 0x043c, 0x0d08, 0x2550,
- 0x1a2c, 0x3518, 0x053c, 0x0e08, 0x2650,
- 0x1b2c, 0x351c, 0x0540, 0x0f08, 0x2750,
- 0x1c2c, 0x341c, 0x0440, 0x1008, 0x2850,
- 0x1d2c, 0x331c, 0x0340, 0x1108, 0x2950,
- 0x1d30, 0x321c, 0x0240, 0x110c, 0x2954,
- 0x1c30, 0x311c, 0x0140, 0x100c, 0x2854,
- 0x1b30, 0x301c, 0x0040, 0x0f0c, 0x2754,
- 0x1a30, 0x3020, 0x0044, 0x0e0c, 0x2654,
- 0x1930, 0x3120, 0x0144, 0x0d0c, 0x2554,
- 0x1830, 0x3220, 0x0244, 0x0c0c, 0x2454,
- 0x1834, 0x3320, 0x0344, 0x0c10, 0x2458,
- 0x1934, 0x3420, 0x0444, 0x0d10, 0x2658,
- 0x1a34, 0x3520, 0x0544, 0x0e10, 0x2858,
- 0x1e24, 0x3910, 0x0934, 0x1200, 0x2a48,
- 0x1f24, 0x3a10, 0x0a34, 0x1300, 0x2b48,
- 0x2024, 0x3b10, 0x0b34, 0x1400, 0x2c48,
- 0x2124, 0x3b14, 0x0b38, 0x1500, 0x2d48,
- 0x2224, 0x3a14, 0x0a38, 0x1600, 0x2e48,
- 0x2324, 0x3914, 0x0938, 0x1700, 0x2f48,
- 0x2328, 0x3814, 0x0838, 0x1704, 0x2f4c,
- 0x2228, 0x3714, 0x0738, 0x1604, 0x2e4c,
- 0x2128, 0x3614, 0x0638, 0x1504, 0x2d4c,
- 0x2028, 0x3618, 0x063c, 0x1404, 0x2c4c,
- 0x1f28, 0x3718, 0x073c, 0x1304, 0x2b4c,
- 0x1e28, 0x3818, 0x083c, 0x1204, 0x2a4c,
- 0x1e2c, 0x3918, 0x093c, 0x1208, 0x2a50,
- 0x1f2c, 0x3a18, 0x0a3c, 0x1308, 0x2b50,
- 0x202c, 0x3b18, 0x0b3c, 0x1408, 0x2c50,
- 0x212c, 0x3b1c, 0x0b40, 0x1508, 0x2d50,
- 0x222c, 0x3a1c, 0x0a40, 0x1608, 0x2e50,
- 0x232c, 0x391c, 0x0940, 0x1708, 0x2f50,
- 0x2330, 0x381c, 0x0840, 0x170c, 0x2f54,
- 0x2230, 0x371c, 0x0740, 0x160c, 0x2e54,
- 0x2130, 0x361c, 0x0640, 0x150c, 0x2d54,
- 0x2030, 0x3620, 0x0644, 0x140c, 0x2c54,
- 0x1f30, 0x3720, 0x0744, 0x130c, 0x2b54,
- 0x1e30, 0x3820, 0x0844, 0x120c, 0x2a54,
- 0x1e34, 0x3920, 0x0944, 0x1210, 0x2a58,
- 0x1f34, 0x3a20, 0x0a44, 0x1310, 0x2c58,
- 0x2034, 0x3b20, 0x0b44, 0x1410, 0x2e58,
- 0x2424, 0x0310, 0x0f34, 0x1800, 0x3048,
- 0x2524, 0x0410, 0x1034, 0x1900, 0x3148,
- 0x2624, 0x0510, 0x1134, 0x1a00, 0x3248,
- 0x2724, 0x0514, 0x1138, 0x1b00, 0x3348,
- 0x2824, 0x0414, 0x1038, 0x1c00, 0x3448,
- 0x2924, 0x0314, 0x0f38, 0x1d00, 0x3548,
- 0x2928, 0x0214, 0x0e38, 0x1d04, 0x354c,
- 0x2828, 0x0114, 0x0d38, 0x1c04, 0x344c,
- 0x2728, 0x0014, 0x0c38, 0x1b04, 0x334c,
- 0x2628, 0x0018, 0x0c3c, 0x1a04, 0x324c,
- 0x2528, 0x0118, 0x0d3c, 0x1904, 0x314c,
- 0x2428, 0x0218, 0x0e3c, 0x1804, 0x304c,
- 0x242c, 0x0318, 0x0f3c, 0x1808, 0x3050,
- 0x252c, 0x0418, 0x103c, 0x1908, 0x3150,
- 0x262c, 0x0518, 0x113c, 0x1a08, 0x3250,
- 0x272c, 0x051c, 0x1140, 0x1b08, 0x3350,
- 0x282c, 0x041c, 0x1040, 0x1c08, 0x3450,
- 0x292c, 0x031c, 0x0f40, 0x1d08, 0x3550,
- 0x2930, 0x021c, 0x0e40, 0x1d0c, 0x3554,
- 0x2830, 0x011c, 0x0d40, 0x1c0c, 0x3454,
- 0x2730, 0x001c, 0x0c40, 0x1b0c, 0x3354,
- 0x2630, 0x0020, 0x0c44, 0x1a0c, 0x3254,
- 0x2530, 0x0120, 0x0d44, 0x190c, 0x3154,
- 0x2430, 0x0220, 0x0e44, 0x180c, 0x3054,
- 0x2434, 0x0320, 0x0f44, 0x1810, 0x3058,
- 0x2534, 0x0420, 0x1044, 0x1910, 0x3258,
- 0x2634, 0x0520, 0x1144, 0x1a10, 0x3458,
- 0x2a24, 0x0910, 0x1534, 0x1e00, 0x3648,
- 0x2b24, 0x0a10, 0x1634, 0x1f00, 0x3748,
- 0x2c24, 0x0b10, 0x1734, 0x2000, 0x3848,
- 0x2d24, 0x0b14, 0x1738, 0x2100, 0x3948,
- 0x2e24, 0x0a14, 0x1638, 0x2200, 0x3a48,
- 0x2f24, 0x0914, 0x1538, 0x2300, 0x3b48,
- 0x2f28, 0x0814, 0x1438, 0x2304, 0x3b4c,
- 0x2e28, 0x0714, 0x1338, 0x2204, 0x3a4c,
- 0x2d28, 0x0614, 0x1238, 0x2104, 0x394c,
- 0x2c28, 0x0618, 0x123c, 0x2004, 0x384c,
- 0x2b28, 0x0718, 0x133c, 0x1f04, 0x374c,
- 0x2a28, 0x0818, 0x143c, 0x1e04, 0x364c,
- 0x2a2c, 0x0918, 0x153c, 0x1e08, 0x3650,
- 0x2b2c, 0x0a18, 0x163c, 0x1f08, 0x3750,
- 0x2c2c, 0x0b18, 0x173c, 0x2008, 0x3850,
- 0x2d2c, 0x0b1c, 0x1740, 0x2108, 0x3950,
- 0x2e2c, 0x0a1c, 0x1640, 0x2208, 0x3a50,
- 0x2f2c, 0x091c, 0x1540, 0x2308, 0x3b50,
- 0x2f30, 0x081c, 0x1440, 0x230c, 0x3b54,
- 0x2e30, 0x071c, 0x1340, 0x220c, 0x3a54,
- 0x2d30, 0x061c, 0x1240, 0x210c, 0x3954,
- 0x2c30, 0x0620, 0x1244, 0x200c, 0x3854,
- 0x2b30, 0x0720, 0x1344, 0x1f0c, 0x3754,
- 0x2a30, 0x0820, 0x1444, 0x1e0c, 0x3654,
- 0x2a34, 0x0920, 0x1544, 0x1e10, 0x3658,
- 0x2b34, 0x0a20, 0x1644, 0x1f10, 0x3858,
- 0x2c34, 0x0b20, 0x1744, 0x2010, 0x3a58,
- 0x3024, 0x0f10, 0x1b34, 0x2400, 0x0048,
- 0x3124, 0x1010, 0x1c34, 0x2500, 0x0148,
- 0x3224, 0x1110, 0x1d34, 0x2600, 0x0248,
- 0x3324, 0x1114, 0x1d38, 0x2700, 0x0348,
- 0x3424, 0x1014, 0x1c38, 0x2800, 0x0448,
- 0x3524, 0x0f14, 0x1b38, 0x2900, 0x0548,
- 0x3528, 0x0e14, 0x1a38, 0x2904, 0x054c,
- 0x3428, 0x0d14, 0x1938, 0x2804, 0x044c,
- 0x3328, 0x0c14, 0x1838, 0x2704, 0x034c,
- 0x3228, 0x0c18, 0x183c, 0x2604, 0x024c,
- 0x3128, 0x0d18, 0x193c, 0x2504, 0x014c,
- 0x3028, 0x0e18, 0x1a3c, 0x2404, 0x004c,
- 0x302c, 0x0f18, 0x1b3c, 0x2408, 0x0050,
- 0x312c, 0x1018, 0x1c3c, 0x2508, 0x0150,
- 0x322c, 0x1118, 0x1d3c, 0x2608, 0x0250,
- 0x332c, 0x111c, 0x1d40, 0x2708, 0x0350,
- 0x342c, 0x101c, 0x1c40, 0x2808, 0x0450,
- 0x352c, 0x0f1c, 0x1b40, 0x2908, 0x0550,
- 0x3530, 0x0e1c, 0x1a40, 0x290c, 0x0554,
- 0x3430, 0x0d1c, 0x1940, 0x280c, 0x0454,
- 0x3330, 0x0c1c, 0x1840, 0x270c, 0x0354,
- 0x3230, 0x0c20, 0x1844, 0x260c, 0x0254,
- 0x3130, 0x0d20, 0x1944, 0x250c, 0x0154,
- 0x3030, 0x0e20, 0x1a44, 0x240c, 0x0054,
- 0x3034, 0x0f20, 0x1b44, 0x2410, 0x0058,
- 0x3134, 0x1020, 0x1c44, 0x2510, 0x0258,
- 0x3234, 0x1120, 0x1d44, 0x2610, 0x0458,
- 0x3624, 0x1510, 0x2134, 0x2a00, 0x0648,
- 0x3724, 0x1610, 0x2234, 0x2b00, 0x0748,
- 0x3824, 0x1710, 0x2334, 0x2c00, 0x0848,
- 0x3924, 0x1714, 0x2338, 0x2d00, 0x0948,
- 0x3a24, 0x1614, 0x2238, 0x2e00, 0x0a48,
- 0x3b24, 0x1514, 0x2138, 0x2f00, 0x0b48,
- 0x3b28, 0x1414, 0x2038, 0x2f04, 0x0b4c,
- 0x3a28, 0x1314, 0x1f38, 0x2e04, 0x0a4c,
- 0x3928, 0x1214, 0x1e38, 0x2d04, 0x094c,
- 0x3828, 0x1218, 0x1e3c, 0x2c04, 0x084c,
- 0x3728, 0x1318, 0x1f3c, 0x2b04, 0x074c,
- 0x3628, 0x1418, 0x203c, 0x2a04, 0x064c,
- 0x362c, 0x1518, 0x213c, 0x2a08, 0x0650,
- 0x372c, 0x1618, 0x223c, 0x2b08, 0x0750,
- 0x382c, 0x1718, 0x233c, 0x2c08, 0x0850,
- 0x392c, 0x171c, 0x2340, 0x2d08, 0x0950,
- 0x3a2c, 0x161c, 0x2240, 0x2e08, 0x0a50,
- 0x3b2c, 0x151c, 0x2140, 0x2f08, 0x0b50,
- 0x3b30, 0x141c, 0x2040, 0x2f0c, 0x0b54,
- 0x3a30, 0x131c, 0x1f40, 0x2e0c, 0x0a54,
- 0x3930, 0x121c, 0x1e40, 0x2d0c, 0x0954,
- 0x3830, 0x1220, 0x1e44, 0x2c0c, 0x0854,
- 0x3730, 0x1320, 0x1f44, 0x2b0c, 0x0754,
- 0x3630, 0x1420, 0x2044, 0x2a0c, 0x0654,
- 0x3634, 0x1520, 0x2144, 0x2a10, 0x0658,
- 0x3734, 0x1620, 0x2244, 0x2b10, 0x0858,
- 0x3834, 0x1720, 0x2344, 0x2c10, 0x0a58,
- 0x0024, 0x1b10, 0x2734, 0x3000, 0x0c48,
- 0x0124, 0x1c10, 0x2834, 0x3100, 0x0d48,
- 0x0224, 0x1d10, 0x2934, 0x3200, 0x0e48,
- 0x0324, 0x1d14, 0x2938, 0x3300, 0x0f48,
- 0x0424, 0x1c14, 0x2838, 0x3400, 0x1048,
- 0x0524, 0x1b14, 0x2738, 0x3500, 0x1148,
- 0x0528, 0x1a14, 0x2638, 0x3504, 0x114c,
- 0x0428, 0x1914, 0x2538, 0x3404, 0x104c,
- 0x0328, 0x1814, 0x2438, 0x3304, 0x0f4c,
- 0x0228, 0x1818, 0x243c, 0x3204, 0x0e4c,
- 0x0128, 0x1918, 0x253c, 0x3104, 0x0d4c,
- 0x0028, 0x1a18, 0x263c, 0x3004, 0x0c4c,
- 0x002c, 0x1b18, 0x273c, 0x3008, 0x0c50,
- 0x012c, 0x1c18, 0x283c, 0x3108, 0x0d50,
- 0x022c, 0x1d18, 0x293c, 0x3208, 0x0e50,
- 0x032c, 0x1d1c, 0x2940, 0x3308, 0x0f50,
- 0x042c, 0x1c1c, 0x2840, 0x3408, 0x1050,
- 0x052c, 0x1b1c, 0x2740, 0x3508, 0x1150,
- 0x0530, 0x1a1c, 0x2640, 0x350c, 0x1154,
- 0x0430, 0x191c, 0x2540, 0x340c, 0x1054,
- 0x0330, 0x181c, 0x2440, 0x330c, 0x0f54,
- 0x0230, 0x1820, 0x2444, 0x320c, 0x0e54,
- 0x0130, 0x1920, 0x2544, 0x310c, 0x0d54,
- 0x0030, 0x1a20, 0x2644, 0x300c, 0x0c54,
- 0x0034, 0x1b20, 0x2744, 0x3010, 0x0c58,
- 0x0134, 0x1c20, 0x2844, 0x3110, 0x0e58,
- 0x0234, 0x1d20, 0x2944, 0x3210, 0x1058,
- 0x0624, 0x2110, 0x2d34, 0x3600, 0x1248,
- 0x0724, 0x2210, 0x2e34, 0x3700, 0x1348,
- 0x0824, 0x2310, 0x2f34, 0x3800, 0x1448,
- 0x0924, 0x2314, 0x2f38, 0x3900, 0x1548,
- 0x0a24, 0x2214, 0x2e38, 0x3a00, 0x1648,
- 0x0b24, 0x2114, 0x2d38, 0x3b00, 0x1748,
- 0x0b28, 0x2014, 0x2c38, 0x3b04, 0x174c,
- 0x0a28, 0x1f14, 0x2b38, 0x3a04, 0x164c,
- 0x0928, 0x1e14, 0x2a38, 0x3904, 0x154c,
- 0x0828, 0x1e18, 0x2a3c, 0x3804, 0x144c,
- 0x0728, 0x1f18, 0x2b3c, 0x3704, 0x134c,
- 0x0628, 0x2018, 0x2c3c, 0x3604, 0x124c,
- 0x062c, 0x2118, 0x2d3c, 0x3608, 0x1250,
- 0x072c, 0x2218, 0x2e3c, 0x3708, 0x1350,
- 0x082c, 0x2318, 0x2f3c, 0x3808, 0x1450,
- 0x092c, 0x231c, 0x2f40, 0x3908, 0x1550,
- 0x0a2c, 0x221c, 0x2e40, 0x3a08, 0x1650,
- 0x0b2c, 0x211c, 0x2d40, 0x3b08, 0x1750,
- 0x0b30, 0x201c, 0x2c40, 0x3b0c, 0x1754,
- 0x0a30, 0x1f1c, 0x2b40, 0x3a0c, 0x1654,
- 0x0930, 0x1e1c, 0x2a40, 0x390c, 0x1554,
- 0x0830, 0x1e20, 0x2a44, 0x380c, 0x1454,
- 0x0730, 0x1f20, 0x2b44, 0x370c, 0x1354,
- 0x0630, 0x2020, 0x2c44, 0x360c, 0x1254,
- 0x0634, 0x2120, 0x2d44, 0x3610, 0x1258,
- 0x0734, 0x2220, 0x2e44, 0x3710, 0x1458,
- 0x0834, 0x2320, 0x2f44, 0x3810, 0x1658,
-};
-
-/* 2 channels per frame, 10 DIF sequences per channel,
- 27 video segments per DIF sequence, 5 macroblocks per video segment */
-static const uint16_t dv_place_422_525[2*10*27*5] = {
- 0x0c24, 0x2412, 0x3036, 0x0000, 0x1848,
- 0x0d24, 0x2512, 0x3136, 0x0100, 0x1948,
- 0x0e24, 0x2612, 0x3236, 0x0200, 0x1a48,
- 0x0e26, 0x2614, 0x3238, 0x0202, 0x1a4a,
- 0x0d26, 0x2514, 0x3138, 0x0102, 0x194a,
- 0x0c26, 0x2414, 0x3038, 0x0002, 0x184a,
- 0x0c28, 0x2416, 0x303a, 0x0004, 0x184c,
- 0x0d28, 0x2516, 0x313a, 0x0104, 0x194c,
- 0x0e28, 0x2616, 0x323a, 0x0204, 0x1a4c,
- 0x0e2a, 0x2618, 0x323c, 0x0206, 0x1a4e,
- 0x0d2a, 0x2518, 0x313c, 0x0106, 0x194e,
- 0x0c2a, 0x2418, 0x303c, 0x0006, 0x184e,
- 0x0c2c, 0x241a, 0x303e, 0x0008, 0x1850,
- 0x0d2c, 0x251a, 0x313e, 0x0108, 0x1950,
- 0x0e2c, 0x261a, 0x323e, 0x0208, 0x1a50,
- 0x0e2e, 0x261c, 0x3240, 0x020a, 0x1a52,
- 0x0d2e, 0x251c, 0x3140, 0x010a, 0x1952,
- 0x0c2e, 0x241c, 0x3040, 0x000a, 0x1852,
- 0x0c30, 0x241e, 0x3042, 0x000c, 0x1854,
- 0x0d30, 0x251e, 0x3142, 0x010c, 0x1954,
- 0x0e30, 0x261e, 0x3242, 0x020c, 0x1a54,
- 0x0e32, 0x2620, 0x3244, 0x020e, 0x1a56,
- 0x0d32, 0x2520, 0x3144, 0x010e, 0x1956,
- 0x0c32, 0x2420, 0x3044, 0x000e, 0x1856,
- 0x0c34, 0x2422, 0x3046, 0x0010, 0x1858,
- 0x0d34, 0x2522, 0x3146, 0x0110, 0x1958,
- 0x0e34, 0x2622, 0x3246, 0x0210, 0x1a58,
- 0x1224, 0x2a12, 0x3636, 0x0600, 0x1e48,
- 0x1324, 0x2b12, 0x3736, 0x0700, 0x1f48,
- 0x1424, 0x2c12, 0x3836, 0x0800, 0x2048,
- 0x1426, 0x2c14, 0x3838, 0x0802, 0x204a,
- 0x1326, 0x2b14, 0x3738, 0x0702, 0x1f4a,
- 0x1226, 0x2a14, 0x3638, 0x0602, 0x1e4a,
- 0x1228, 0x2a16, 0x363a, 0x0604, 0x1e4c,
- 0x1328, 0x2b16, 0x373a, 0x0704, 0x1f4c,
- 0x1428, 0x2c16, 0x383a, 0x0804, 0x204c,
- 0x142a, 0x2c18, 0x383c, 0x0806, 0x204e,
- 0x132a, 0x2b18, 0x373c, 0x0706, 0x1f4e,
- 0x122a, 0x2a18, 0x363c, 0x0606, 0x1e4e,
- 0x122c, 0x2a1a, 0x363e, 0x0608, 0x1e50,
- 0x132c, 0x2b1a, 0x373e, 0x0708, 0x1f50,
- 0x142c, 0x2c1a, 0x383e, 0x0808, 0x2050,
- 0x142e, 0x2c1c, 0x3840, 0x080a, 0x2052,
- 0x132e, 0x2b1c, 0x3740, 0x070a, 0x1f52,
- 0x122e, 0x2a1c, 0x3640, 0x060a, 0x1e52,
- 0x1230, 0x2a1e, 0x3642, 0x060c, 0x1e54,
- 0x1330, 0x2b1e, 0x3742, 0x070c, 0x1f54,
- 0x1430, 0x2c1e, 0x3842, 0x080c, 0x2054,
- 0x1432, 0x2c20, 0x3844, 0x080e, 0x2056,
- 0x1332, 0x2b20, 0x3744, 0x070e, 0x1f56,
- 0x1232, 0x2a20, 0x3644, 0x060e, 0x1e56,
- 0x1234, 0x2a22, 0x3646, 0x0610, 0x1e58,
- 0x1334, 0x2b22, 0x3746, 0x0710, 0x1f58,
- 0x1434, 0x2c22, 0x3846, 0x0810, 0x2058,
- 0x1824, 0x3012, 0x0036, 0x0c00, 0x2448,
- 0x1924, 0x3112, 0x0136, 0x0d00, 0x2548,
- 0x1a24, 0x3212, 0x0236, 0x0e00, 0x2648,
- 0x1a26, 0x3214, 0x0238, 0x0e02, 0x264a,
- 0x1926, 0x3114, 0x0138, 0x0d02, 0x254a,
- 0x1826, 0x3014, 0x0038, 0x0c02, 0x244a,
- 0x1828, 0x3016, 0x003a, 0x0c04, 0x244c,
- 0x1928, 0x3116, 0x013a, 0x0d04, 0x254c,
- 0x1a28, 0x3216, 0x023a, 0x0e04, 0x264c,
- 0x1a2a, 0x3218, 0x023c, 0x0e06, 0x264e,
- 0x192a, 0x3118, 0x013c, 0x0d06, 0x254e,
- 0x182a, 0x3018, 0x003c, 0x0c06, 0x244e,
- 0x182c, 0x301a, 0x003e, 0x0c08, 0x2450,
- 0x192c, 0x311a, 0x013e, 0x0d08, 0x2550,
- 0x1a2c, 0x321a, 0x023e, 0x0e08, 0x2650,
- 0x1a2e, 0x321c, 0x0240, 0x0e0a, 0x2652,
- 0x192e, 0x311c, 0x0140, 0x0d0a, 0x2552,
- 0x182e, 0x301c, 0x0040, 0x0c0a, 0x2452,
- 0x1830, 0x301e, 0x0042, 0x0c0c, 0x2454,
- 0x1930, 0x311e, 0x0142, 0x0d0c, 0x2554,
- 0x1a30, 0x321e, 0x0242, 0x0e0c, 0x2654,
- 0x1a32, 0x3220, 0x0244, 0x0e0e, 0x2656,
- 0x1932, 0x3120, 0x0144, 0x0d0e, 0x2556,
- 0x1832, 0x3020, 0x0044, 0x0c0e, 0x2456,
- 0x1834, 0x3022, 0x0046, 0x0c10, 0x2458,
- 0x1934, 0x3122, 0x0146, 0x0d10, 0x2558,
- 0x1a34, 0x3222, 0x0246, 0x0e10, 0x2658,
- 0x1e24, 0x3612, 0x0636, 0x1200, 0x2a48,
- 0x1f24, 0x3712, 0x0736, 0x1300, 0x2b48,
- 0x2024, 0x3812, 0x0836, 0x1400, 0x2c48,
- 0x2026, 0x3814, 0x0838, 0x1402, 0x2c4a,
- 0x1f26, 0x3714, 0x0738, 0x1302, 0x2b4a,
- 0x1e26, 0x3614, 0x0638, 0x1202, 0x2a4a,
- 0x1e28, 0x3616, 0x063a, 0x1204, 0x2a4c,
- 0x1f28, 0x3716, 0x073a, 0x1304, 0x2b4c,
- 0x2028, 0x3816, 0x083a, 0x1404, 0x2c4c,
- 0x202a, 0x3818, 0x083c, 0x1406, 0x2c4e,
- 0x1f2a, 0x3718, 0x073c, 0x1306, 0x2b4e,
- 0x1e2a, 0x3618, 0x063c, 0x1206, 0x2a4e,
- 0x1e2c, 0x361a, 0x063e, 0x1208, 0x2a50,
- 0x1f2c, 0x371a, 0x073e, 0x1308, 0x2b50,
- 0x202c, 0x381a, 0x083e, 0x1408, 0x2c50,
- 0x202e, 0x381c, 0x0840, 0x140a, 0x2c52,
- 0x1f2e, 0x371c, 0x0740, 0x130a, 0x2b52,
- 0x1e2e, 0x361c, 0x0640, 0x120a, 0x2a52,
- 0x1e30, 0x361e, 0x0642, 0x120c, 0x2a54,
- 0x1f30, 0x371e, 0x0742, 0x130c, 0x2b54,
- 0x2030, 0x381e, 0x0842, 0x140c, 0x2c54,
- 0x2032, 0x3820, 0x0844, 0x140e, 0x2c56,
- 0x1f32, 0x3720, 0x0744, 0x130e, 0x2b56,
- 0x1e32, 0x3620, 0x0644, 0x120e, 0x2a56,
- 0x1e34, 0x3622, 0x0646, 0x1210, 0x2a58,
- 0x1f34, 0x3722, 0x0746, 0x1310, 0x2b58,
- 0x2034, 0x3822, 0x0846, 0x1410, 0x2c58,
- 0x2424, 0x0012, 0x0c36, 0x1800, 0x3048,
- 0x2524, 0x0112, 0x0d36, 0x1900, 0x3148,
- 0x2624, 0x0212, 0x0e36, 0x1a00, 0x3248,
- 0x2626, 0x0214, 0x0e38, 0x1a02, 0x324a,
- 0x2526, 0x0114, 0x0d38, 0x1902, 0x314a,
- 0x2426, 0x0014, 0x0c38, 0x1802, 0x304a,
- 0x2428, 0x0016, 0x0c3a, 0x1804, 0x304c,
- 0x2528, 0x0116, 0x0d3a, 0x1904, 0x314c,
- 0x2628, 0x0216, 0x0e3a, 0x1a04, 0x324c,
- 0x262a, 0x0218, 0x0e3c, 0x1a06, 0x324e,
- 0x252a, 0x0118, 0x0d3c, 0x1906, 0x314e,
- 0x242a, 0x0018, 0x0c3c, 0x1806, 0x304e,
- 0x242c, 0x001a, 0x0c3e, 0x1808, 0x3050,
- 0x252c, 0x011a, 0x0d3e, 0x1908, 0x3150,
- 0x262c, 0x021a, 0x0e3e, 0x1a08, 0x3250,
- 0x262e, 0x021c, 0x0e40, 0x1a0a, 0x3252,
- 0x252e, 0x011c, 0x0d40, 0x190a, 0x3152,
- 0x242e, 0x001c, 0x0c40, 0x180a, 0x3052,
- 0x2430, 0x001e, 0x0c42, 0x180c, 0x3054,
- 0x2530, 0x011e, 0x0d42, 0x190c, 0x3154,
- 0x2630, 0x021e, 0x0e42, 0x1a0c, 0x3254,
- 0x2632, 0x0220, 0x0e44, 0x1a0e, 0x3256,
- 0x2532, 0x0120, 0x0d44, 0x190e, 0x3156,
- 0x2432, 0x0020, 0x0c44, 0x180e, 0x3056,
- 0x2434, 0x0022, 0x0c46, 0x1810, 0x3058,
- 0x2534, 0x0122, 0x0d46, 0x1910, 0x3158,
- 0x2634, 0x0222, 0x0e46, 0x1a10, 0x3258,
- 0x2a24, 0x0612, 0x1236, 0x1e00, 0x3648,
- 0x2b24, 0x0712, 0x1336, 0x1f00, 0x3748,
- 0x2c24, 0x0812, 0x1436, 0x2000, 0x3848,
- 0x2c26, 0x0814, 0x1438, 0x2002, 0x384a,
- 0x2b26, 0x0714, 0x1338, 0x1f02, 0x374a,
- 0x2a26, 0x0614, 0x1238, 0x1e02, 0x364a,
- 0x2a28, 0x0616, 0x123a, 0x1e04, 0x364c,
- 0x2b28, 0x0716, 0x133a, 0x1f04, 0x374c,
- 0x2c28, 0x0816, 0x143a, 0x2004, 0x384c,
- 0x2c2a, 0x0818, 0x143c, 0x2006, 0x384e,
- 0x2b2a, 0x0718, 0x133c, 0x1f06, 0x374e,
- 0x2a2a, 0x0618, 0x123c, 0x1e06, 0x364e,
- 0x2a2c, 0x061a, 0x123e, 0x1e08, 0x3650,
- 0x2b2c, 0x071a, 0x133e, 0x1f08, 0x3750,
- 0x2c2c, 0x081a, 0x143e, 0x2008, 0x3850,
- 0x2c2e, 0x081c, 0x1440, 0x200a, 0x3852,
- 0x2b2e, 0x071c, 0x1340, 0x1f0a, 0x3752,
- 0x2a2e, 0x061c, 0x1240, 0x1e0a, 0x3652,
- 0x2a30, 0x061e, 0x1242, 0x1e0c, 0x3654,
- 0x2b30, 0x071e, 0x1342, 0x1f0c, 0x3754,
- 0x2c30, 0x081e, 0x1442, 0x200c, 0x3854,
- 0x2c32, 0x0820, 0x1444, 0x200e, 0x3856,
- 0x2b32, 0x0720, 0x1344, 0x1f0e, 0x3756,
- 0x2a32, 0x0620, 0x1244, 0x1e0e, 0x3656,
- 0x2a34, 0x0622, 0x1246, 0x1e10, 0x3658,
- 0x2b34, 0x0722, 0x1346, 0x1f10, 0x3758,
- 0x2c34, 0x0822, 0x1446, 0x2010, 0x3858,
- 0x3024, 0x0c12, 0x1836, 0x2400, 0x0048,
- 0x3124, 0x0d12, 0x1936, 0x2500, 0x0148,
- 0x3224, 0x0e12, 0x1a36, 0x2600, 0x0248,
- 0x3226, 0x0e14, 0x1a38, 0x2602, 0x024a,
- 0x3126, 0x0d14, 0x1938, 0x2502, 0x014a,
- 0x3026, 0x0c14, 0x1838, 0x2402, 0x004a,
- 0x3028, 0x0c16, 0x183a, 0x2404, 0x004c,
- 0x3128, 0x0d16, 0x193a, 0x2504, 0x014c,
- 0x3228, 0x0e16, 0x1a3a, 0x2604, 0x024c,
- 0x322a, 0x0e18, 0x1a3c, 0x2606, 0x024e,
- 0x312a, 0x0d18, 0x193c, 0x2506, 0x014e,
- 0x302a, 0x0c18, 0x183c, 0x2406, 0x004e,
- 0x302c, 0x0c1a, 0x183e, 0x2408, 0x0050,
- 0x312c, 0x0d1a, 0x193e, 0x2508, 0x0150,
- 0x322c, 0x0e1a, 0x1a3e, 0x2608, 0x0250,
- 0x322e, 0x0e1c, 0x1a40, 0x260a, 0x0252,
- 0x312e, 0x0d1c, 0x1940, 0x250a, 0x0152,
- 0x302e, 0x0c1c, 0x1840, 0x240a, 0x0052,
- 0x3030, 0x0c1e, 0x1842, 0x240c, 0x0054,
- 0x3130, 0x0d1e, 0x1942, 0x250c, 0x0154,
- 0x3230, 0x0e1e, 0x1a42, 0x260c, 0x0254,
- 0x3232, 0x0e20, 0x1a44, 0x260e, 0x0256,
- 0x3132, 0x0d20, 0x1944, 0x250e, 0x0156,
- 0x3032, 0x0c20, 0x1844, 0x240e, 0x0056,
- 0x3034, 0x0c22, 0x1846, 0x2410, 0x0058,
- 0x3134, 0x0d22, 0x1946, 0x2510, 0x0158,
- 0x3234, 0x0e22, 0x1a46, 0x2610, 0x0258,
- 0x3624, 0x1212, 0x1e36, 0x2a00, 0x0648,
- 0x3724, 0x1312, 0x1f36, 0x2b00, 0x0748,
- 0x3824, 0x1412, 0x2036, 0x2c00, 0x0848,
- 0x3826, 0x1414, 0x2038, 0x2c02, 0x084a,
- 0x3726, 0x1314, 0x1f38, 0x2b02, 0x074a,
- 0x3626, 0x1214, 0x1e38, 0x2a02, 0x064a,
- 0x3628, 0x1216, 0x1e3a, 0x2a04, 0x064c,
- 0x3728, 0x1316, 0x1f3a, 0x2b04, 0x074c,
- 0x3828, 0x1416, 0x203a, 0x2c04, 0x084c,
- 0x382a, 0x1418, 0x203c, 0x2c06, 0x084e,
- 0x372a, 0x1318, 0x1f3c, 0x2b06, 0x074e,
- 0x362a, 0x1218, 0x1e3c, 0x2a06, 0x064e,
- 0x362c, 0x121a, 0x1e3e, 0x2a08, 0x0650,
- 0x372c, 0x131a, 0x1f3e, 0x2b08, 0x0750,
- 0x382c, 0x141a, 0x203e, 0x2c08, 0x0850,
- 0x382e, 0x141c, 0x2040, 0x2c0a, 0x0852,
- 0x372e, 0x131c, 0x1f40, 0x2b0a, 0x0752,
- 0x362e, 0x121c, 0x1e40, 0x2a0a, 0x0652,
- 0x3630, 0x121e, 0x1e42, 0x2a0c, 0x0654,
- 0x3730, 0x131e, 0x1f42, 0x2b0c, 0x0754,
- 0x3830, 0x141e, 0x2042, 0x2c0c, 0x0854,
- 0x3832, 0x1420, 0x2044, 0x2c0e, 0x0856,
- 0x3732, 0x1320, 0x1f44, 0x2b0e, 0x0756,
- 0x3632, 0x1220, 0x1e44, 0x2a0e, 0x0656,
- 0x3634, 0x1222, 0x1e46, 0x2a10, 0x0658,
- 0x3734, 0x1322, 0x1f46, 0x2b10, 0x0758,
- 0x3834, 0x1422, 0x2046, 0x2c10, 0x0858,
- 0x0024, 0x1812, 0x2436, 0x3000, 0x0c48,
- 0x0124, 0x1912, 0x2536, 0x3100, 0x0d48,
- 0x0224, 0x1a12, 0x2636, 0x3200, 0x0e48,
- 0x0226, 0x1a14, 0x2638, 0x3202, 0x0e4a,
- 0x0126, 0x1914, 0x2538, 0x3102, 0x0d4a,
- 0x0026, 0x1814, 0x2438, 0x3002, 0x0c4a,
- 0x0028, 0x1816, 0x243a, 0x3004, 0x0c4c,
- 0x0128, 0x1916, 0x253a, 0x3104, 0x0d4c,
- 0x0228, 0x1a16, 0x263a, 0x3204, 0x0e4c,
- 0x022a, 0x1a18, 0x263c, 0x3206, 0x0e4e,
- 0x012a, 0x1918, 0x253c, 0x3106, 0x0d4e,
- 0x002a, 0x1818, 0x243c, 0x3006, 0x0c4e,
- 0x002c, 0x181a, 0x243e, 0x3008, 0x0c50,
- 0x012c, 0x191a, 0x253e, 0x3108, 0x0d50,
- 0x022c, 0x1a1a, 0x263e, 0x3208, 0x0e50,
- 0x022e, 0x1a1c, 0x2640, 0x320a, 0x0e52,
- 0x012e, 0x191c, 0x2540, 0x310a, 0x0d52,
- 0x002e, 0x181c, 0x2440, 0x300a, 0x0c52,
- 0x0030, 0x181e, 0x2442, 0x300c, 0x0c54,
- 0x0130, 0x191e, 0x2542, 0x310c, 0x0d54,
- 0x0230, 0x1a1e, 0x2642, 0x320c, 0x0e54,
- 0x0232, 0x1a20, 0x2644, 0x320e, 0x0e56,
- 0x0132, 0x1920, 0x2544, 0x310e, 0x0d56,
- 0x0032, 0x1820, 0x2444, 0x300e, 0x0c56,
- 0x0034, 0x1822, 0x2446, 0x3010, 0x0c58,
- 0x0134, 0x1922, 0x2546, 0x3110, 0x0d58,
- 0x0234, 0x1a22, 0x2646, 0x3210, 0x0e58,
- 0x0624, 0x1e12, 0x2a36, 0x3600, 0x1248,
- 0x0724, 0x1f12, 0x2b36, 0x3700, 0x1348,
- 0x0824, 0x2012, 0x2c36, 0x3800, 0x1448,
- 0x0826, 0x2014, 0x2c38, 0x3802, 0x144a,
- 0x0726, 0x1f14, 0x2b38, 0x3702, 0x134a,
- 0x0626, 0x1e14, 0x2a38, 0x3602, 0x124a,
- 0x0628, 0x1e16, 0x2a3a, 0x3604, 0x124c,
- 0x0728, 0x1f16, 0x2b3a, 0x3704, 0x134c,
- 0x0828, 0x2016, 0x2c3a, 0x3804, 0x144c,
- 0x082a, 0x2018, 0x2c3c, 0x3806, 0x144e,
- 0x072a, 0x1f18, 0x2b3c, 0x3706, 0x134e,
- 0x062a, 0x1e18, 0x2a3c, 0x3606, 0x124e,
- 0x062c, 0x1e1a, 0x2a3e, 0x3608, 0x1250,
- 0x072c, 0x1f1a, 0x2b3e, 0x3708, 0x1350,
- 0x082c, 0x201a, 0x2c3e, 0x3808, 0x1450,
- 0x082e, 0x201c, 0x2c40, 0x380a, 0x1452,
- 0x072e, 0x1f1c, 0x2b40, 0x370a, 0x1352,
- 0x062e, 0x1e1c, 0x2a40, 0x360a, 0x1252,
- 0x0630, 0x1e1e, 0x2a42, 0x360c, 0x1254,
- 0x0730, 0x1f1e, 0x2b42, 0x370c, 0x1354,
- 0x0830, 0x201e, 0x2c42, 0x380c, 0x1454,
- 0x0832, 0x2020, 0x2c44, 0x380e, 0x1456,
- 0x0732, 0x1f20, 0x2b44, 0x370e, 0x1356,
- 0x0632, 0x1e20, 0x2a44, 0x360e, 0x1256,
- 0x0634, 0x1e22, 0x2a46, 0x3610, 0x1258,
- 0x0734, 0x1f22, 0x2b46, 0x3710, 0x1358,
- 0x0834, 0x2022, 0x2c46, 0x3810, 0x1458,
- 0x0f24, 0x2712, 0x3336, 0x0300, 0x1b48,
- 0x1024, 0x2812, 0x3436, 0x0400, 0x1c48,
- 0x1124, 0x2912, 0x3536, 0x0500, 0x1d48,
- 0x1126, 0x2914, 0x3538, 0x0502, 0x1d4a,
- 0x1026, 0x2814, 0x3438, 0x0402, 0x1c4a,
- 0x0f26, 0x2714, 0x3338, 0x0302, 0x1b4a,
- 0x0f28, 0x2716, 0x333a, 0x0304, 0x1b4c,
- 0x1028, 0x2816, 0x343a, 0x0404, 0x1c4c,
- 0x1128, 0x2916, 0x353a, 0x0504, 0x1d4c,
- 0x112a, 0x2918, 0x353c, 0x0506, 0x1d4e,
- 0x102a, 0x2818, 0x343c, 0x0406, 0x1c4e,
- 0x0f2a, 0x2718, 0x333c, 0x0306, 0x1b4e,
- 0x0f2c, 0x271a, 0x333e, 0x0308, 0x1b50,
- 0x102c, 0x281a, 0x343e, 0x0408, 0x1c50,
- 0x112c, 0x291a, 0x353e, 0x0508, 0x1d50,
- 0x112e, 0x291c, 0x3540, 0x050a, 0x1d52,
- 0x102e, 0x281c, 0x3440, 0x040a, 0x1c52,
- 0x0f2e, 0x271c, 0x3340, 0x030a, 0x1b52,
- 0x0f30, 0x271e, 0x3342, 0x030c, 0x1b54,
- 0x1030, 0x281e, 0x3442, 0x040c, 0x1c54,
- 0x1130, 0x291e, 0x3542, 0x050c, 0x1d54,
- 0x1132, 0x2920, 0x3544, 0x050e, 0x1d56,
- 0x1032, 0x2820, 0x3444, 0x040e, 0x1c56,
- 0x0f32, 0x2720, 0x3344, 0x030e, 0x1b56,
- 0x0f34, 0x2722, 0x3346, 0x0310, 0x1b58,
- 0x1034, 0x2822, 0x3446, 0x0410, 0x1c58,
- 0x1134, 0x2922, 0x3546, 0x0510, 0x1d58,
- 0x1524, 0x2d12, 0x3936, 0x0900, 0x2148,
- 0x1624, 0x2e12, 0x3a36, 0x0a00, 0x2248,
- 0x1724, 0x2f12, 0x3b36, 0x0b00, 0x2348,
- 0x1726, 0x2f14, 0x3b38, 0x0b02, 0x234a,
- 0x1626, 0x2e14, 0x3a38, 0x0a02, 0x224a,
- 0x1526, 0x2d14, 0x3938, 0x0902, 0x214a,
- 0x1528, 0x2d16, 0x393a, 0x0904, 0x214c,
- 0x1628, 0x2e16, 0x3a3a, 0x0a04, 0x224c,
- 0x1728, 0x2f16, 0x3b3a, 0x0b04, 0x234c,
- 0x172a, 0x2f18, 0x3b3c, 0x0b06, 0x234e,
- 0x162a, 0x2e18, 0x3a3c, 0x0a06, 0x224e,
- 0x152a, 0x2d18, 0x393c, 0x0906, 0x214e,
- 0x152c, 0x2d1a, 0x393e, 0x0908, 0x2150,
- 0x162c, 0x2e1a, 0x3a3e, 0x0a08, 0x2250,
- 0x172c, 0x2f1a, 0x3b3e, 0x0b08, 0x2350,
- 0x172e, 0x2f1c, 0x3b40, 0x0b0a, 0x2352,
- 0x162e, 0x2e1c, 0x3a40, 0x0a0a, 0x2252,
- 0x152e, 0x2d1c, 0x3940, 0x090a, 0x2152,
- 0x1530, 0x2d1e, 0x3942, 0x090c, 0x2154,
- 0x1630, 0x2e1e, 0x3a42, 0x0a0c, 0x2254,
- 0x1730, 0x2f1e, 0x3b42, 0x0b0c, 0x2354,
- 0x1732, 0x2f20, 0x3b44, 0x0b0e, 0x2356,
- 0x1632, 0x2e20, 0x3a44, 0x0a0e, 0x2256,
- 0x1532, 0x2d20, 0x3944, 0x090e, 0x2156,
- 0x1534, 0x2d22, 0x3946, 0x0910, 0x2158,
- 0x1634, 0x2e22, 0x3a46, 0x0a10, 0x2258,
- 0x1734, 0x2f22, 0x3b46, 0x0b10, 0x2358,
- 0x1b24, 0x3312, 0x0336, 0x0f00, 0x2748,
- 0x1c24, 0x3412, 0x0436, 0x1000, 0x2848,
- 0x1d24, 0x3512, 0x0536, 0x1100, 0x2948,
- 0x1d26, 0x3514, 0x0538, 0x1102, 0x294a,
- 0x1c26, 0x3414, 0x0438, 0x1002, 0x284a,
- 0x1b26, 0x3314, 0x0338, 0x0f02, 0x274a,
- 0x1b28, 0x3316, 0x033a, 0x0f04, 0x274c,
- 0x1c28, 0x3416, 0x043a, 0x1004, 0x284c,
- 0x1d28, 0x3516, 0x053a, 0x1104, 0x294c,
- 0x1d2a, 0x3518, 0x053c, 0x1106, 0x294e,
- 0x1c2a, 0x3418, 0x043c, 0x1006, 0x284e,
- 0x1b2a, 0x3318, 0x033c, 0x0f06, 0x274e,
- 0x1b2c, 0x331a, 0x033e, 0x0f08, 0x2750,
- 0x1c2c, 0x341a, 0x043e, 0x1008, 0x2850,
- 0x1d2c, 0x351a, 0x053e, 0x1108, 0x2950,
- 0x1d2e, 0x351c, 0x0540, 0x110a, 0x2952,
- 0x1c2e, 0x341c, 0x0440, 0x100a, 0x2852,
- 0x1b2e, 0x331c, 0x0340, 0x0f0a, 0x2752,
- 0x1b30, 0x331e, 0x0342, 0x0f0c, 0x2754,
- 0x1c30, 0x341e, 0x0442, 0x100c, 0x2854,
- 0x1d30, 0x351e, 0x0542, 0x110c, 0x2954,
- 0x1d32, 0x3520, 0x0544, 0x110e, 0x2956,
- 0x1c32, 0x3420, 0x0444, 0x100e, 0x2856,
- 0x1b32, 0x3320, 0x0344, 0x0f0e, 0x2756,
- 0x1b34, 0x3322, 0x0346, 0x0f10, 0x2758,
- 0x1c34, 0x3422, 0x0446, 0x1010, 0x2858,
- 0x1d34, 0x3522, 0x0546, 0x1110, 0x2958,
- 0x2124, 0x3912, 0x0936, 0x1500, 0x2d48,
- 0x2224, 0x3a12, 0x0a36, 0x1600, 0x2e48,
- 0x2324, 0x3b12, 0x0b36, 0x1700, 0x2f48,
- 0x2326, 0x3b14, 0x0b38, 0x1702, 0x2f4a,
- 0x2226, 0x3a14, 0x0a38, 0x1602, 0x2e4a,
- 0x2126, 0x3914, 0x0938, 0x1502, 0x2d4a,
- 0x2128, 0x3916, 0x093a, 0x1504, 0x2d4c,
- 0x2228, 0x3a16, 0x0a3a, 0x1604, 0x2e4c,
- 0x2328, 0x3b16, 0x0b3a, 0x1704, 0x2f4c,
- 0x232a, 0x3b18, 0x0b3c, 0x1706, 0x2f4e,
- 0x222a, 0x3a18, 0x0a3c, 0x1606, 0x2e4e,
- 0x212a, 0x3918, 0x093c, 0x1506, 0x2d4e,
- 0x212c, 0x391a, 0x093e, 0x1508, 0x2d50,
- 0x222c, 0x3a1a, 0x0a3e, 0x1608, 0x2e50,
- 0x232c, 0x3b1a, 0x0b3e, 0x1708, 0x2f50,
- 0x232e, 0x3b1c, 0x0b40, 0x170a, 0x2f52,
- 0x222e, 0x3a1c, 0x0a40, 0x160a, 0x2e52,
- 0x212e, 0x391c, 0x0940, 0x150a, 0x2d52,
- 0x2130, 0x391e, 0x0942, 0x150c, 0x2d54,
- 0x2230, 0x3a1e, 0x0a42, 0x160c, 0x2e54,
- 0x2330, 0x3b1e, 0x0b42, 0x170c, 0x2f54,
- 0x2332, 0x3b20, 0x0b44, 0x170e, 0x2f56,
- 0x2232, 0x3a20, 0x0a44, 0x160e, 0x2e56,
- 0x2132, 0x3920, 0x0944, 0x150e, 0x2d56,
- 0x2134, 0x3922, 0x0946, 0x1510, 0x2d58,
- 0x2234, 0x3a22, 0x0a46, 0x1610, 0x2e58,
- 0x2334, 0x3b22, 0x0b46, 0x1710, 0x2f58,
- 0x2724, 0x0312, 0x0f36, 0x1b00, 0x3348,
- 0x2824, 0x0412, 0x1036, 0x1c00, 0x3448,
- 0x2924, 0x0512, 0x1136, 0x1d00, 0x3548,
- 0x2926, 0x0514, 0x1138, 0x1d02, 0x354a,
- 0x2826, 0x0414, 0x1038, 0x1c02, 0x344a,
- 0x2726, 0x0314, 0x0f38, 0x1b02, 0x334a,
- 0x2728, 0x0316, 0x0f3a, 0x1b04, 0x334c,
- 0x2828, 0x0416, 0x103a, 0x1c04, 0x344c,
- 0x2928, 0x0516, 0x113a, 0x1d04, 0x354c,
- 0x292a, 0x0518, 0x113c, 0x1d06, 0x354e,
- 0x282a, 0x0418, 0x103c, 0x1c06, 0x344e,
- 0x272a, 0x0318, 0x0f3c, 0x1b06, 0x334e,
- 0x272c, 0x031a, 0x0f3e, 0x1b08, 0x3350,
- 0x282c, 0x041a, 0x103e, 0x1c08, 0x3450,
- 0x292c, 0x051a, 0x113e, 0x1d08, 0x3550,
- 0x292e, 0x051c, 0x1140, 0x1d0a, 0x3552,
- 0x282e, 0x041c, 0x1040, 0x1c0a, 0x3452,
- 0x272e, 0x031c, 0x0f40, 0x1b0a, 0x3352,
- 0x2730, 0x031e, 0x0f42, 0x1b0c, 0x3354,
- 0x2830, 0x041e, 0x1042, 0x1c0c, 0x3454,
- 0x2930, 0x051e, 0x1142, 0x1d0c, 0x3554,
- 0x2932, 0x0520, 0x1144, 0x1d0e, 0x3556,
- 0x2832, 0x0420, 0x1044, 0x1c0e, 0x3456,
- 0x2732, 0x0320, 0x0f44, 0x1b0e, 0x3356,
- 0x2734, 0x0322, 0x0f46, 0x1b10, 0x3358,
- 0x2834, 0x0422, 0x1046, 0x1c10, 0x3458,
- 0x2934, 0x0522, 0x1146, 0x1d10, 0x3558,
- 0x2d24, 0x0912, 0x1536, 0x2100, 0x3948,
- 0x2e24, 0x0a12, 0x1636, 0x2200, 0x3a48,
- 0x2f24, 0x0b12, 0x1736, 0x2300, 0x3b48,
- 0x2f26, 0x0b14, 0x1738, 0x2302, 0x3b4a,
- 0x2e26, 0x0a14, 0x1638, 0x2202, 0x3a4a,
- 0x2d26, 0x0914, 0x1538, 0x2102, 0x394a,
- 0x2d28, 0x0916, 0x153a, 0x2104, 0x394c,
- 0x2e28, 0x0a16, 0x163a, 0x2204, 0x3a4c,
- 0x2f28, 0x0b16, 0x173a, 0x2304, 0x3b4c,
- 0x2f2a, 0x0b18, 0x173c, 0x2306, 0x3b4e,
- 0x2e2a, 0x0a18, 0x163c, 0x2206, 0x3a4e,
- 0x2d2a, 0x0918, 0x153c, 0x2106, 0x394e,
- 0x2d2c, 0x091a, 0x153e, 0x2108, 0x3950,
- 0x2e2c, 0x0a1a, 0x163e, 0x2208, 0x3a50,
- 0x2f2c, 0x0b1a, 0x173e, 0x2308, 0x3b50,
- 0x2f2e, 0x0b1c, 0x1740, 0x230a, 0x3b52,
- 0x2e2e, 0x0a1c, 0x1640, 0x220a, 0x3a52,
- 0x2d2e, 0x091c, 0x1540, 0x210a, 0x3952,
- 0x2d30, 0x091e, 0x1542, 0x210c, 0x3954,
- 0x2e30, 0x0a1e, 0x1642, 0x220c, 0x3a54,
- 0x2f30, 0x0b1e, 0x1742, 0x230c, 0x3b54,
- 0x2f32, 0x0b20, 0x1744, 0x230e, 0x3b56,
- 0x2e32, 0x0a20, 0x1644, 0x220e, 0x3a56,
- 0x2d32, 0x0920, 0x1544, 0x210e, 0x3956,
- 0x2d34, 0x0922, 0x1546, 0x2110, 0x3958,
- 0x2e34, 0x0a22, 0x1646, 0x2210, 0x3a58,
- 0x2f34, 0x0b22, 0x1746, 0x2310, 0x3b58,
- 0x3324, 0x0f12, 0x1b36, 0x2700, 0x0348,
- 0x3424, 0x1012, 0x1c36, 0x2800, 0x0448,
- 0x3524, 0x1112, 0x1d36, 0x2900, 0x0548,
- 0x3526, 0x1114, 0x1d38, 0x2902, 0x054a,
- 0x3426, 0x1014, 0x1c38, 0x2802, 0x044a,
- 0x3326, 0x0f14, 0x1b38, 0x2702, 0x034a,
- 0x3328, 0x0f16, 0x1b3a, 0x2704, 0x034c,
- 0x3428, 0x1016, 0x1c3a, 0x2804, 0x044c,
- 0x3528, 0x1116, 0x1d3a, 0x2904, 0x054c,
- 0x352a, 0x1118, 0x1d3c, 0x2906, 0x054e,
- 0x342a, 0x1018, 0x1c3c, 0x2806, 0x044e,
- 0x332a, 0x0f18, 0x1b3c, 0x2706, 0x034e,
- 0x332c, 0x0f1a, 0x1b3e, 0x2708, 0x0350,
- 0x342c, 0x101a, 0x1c3e, 0x2808, 0x0450,
- 0x352c, 0x111a, 0x1d3e, 0x2908, 0x0550,
- 0x352e, 0x111c, 0x1d40, 0x290a, 0x0552,
- 0x342e, 0x101c, 0x1c40, 0x280a, 0x0452,
- 0x332e, 0x0f1c, 0x1b40, 0x270a, 0x0352,
- 0x3330, 0x0f1e, 0x1b42, 0x270c, 0x0354,
- 0x3430, 0x101e, 0x1c42, 0x280c, 0x0454,
- 0x3530, 0x111e, 0x1d42, 0x290c, 0x0554,
- 0x3532, 0x1120, 0x1d44, 0x290e, 0x0556,
- 0x3432, 0x1020, 0x1c44, 0x280e, 0x0456,
- 0x3332, 0x0f20, 0x1b44, 0x270e, 0x0356,
- 0x3334, 0x0f22, 0x1b46, 0x2710, 0x0358,
- 0x3434, 0x1022, 0x1c46, 0x2810, 0x0458,
- 0x3534, 0x1122, 0x1d46, 0x2910, 0x0558,
- 0x3924, 0x1512, 0x2136, 0x2d00, 0x0948,
- 0x3a24, 0x1612, 0x2236, 0x2e00, 0x0a48,
- 0x3b24, 0x1712, 0x2336, 0x2f00, 0x0b48,
- 0x3b26, 0x1714, 0x2338, 0x2f02, 0x0b4a,
- 0x3a26, 0x1614, 0x2238, 0x2e02, 0x0a4a,
- 0x3926, 0x1514, 0x2138, 0x2d02, 0x094a,
- 0x3928, 0x1516, 0x213a, 0x2d04, 0x094c,
- 0x3a28, 0x1616, 0x223a, 0x2e04, 0x0a4c,
- 0x3b28, 0x1716, 0x233a, 0x2f04, 0x0b4c,
- 0x3b2a, 0x1718, 0x233c, 0x2f06, 0x0b4e,
- 0x3a2a, 0x1618, 0x223c, 0x2e06, 0x0a4e,
- 0x392a, 0x1518, 0x213c, 0x2d06, 0x094e,
- 0x392c, 0x151a, 0x213e, 0x2d08, 0x0950,
- 0x3a2c, 0x161a, 0x223e, 0x2e08, 0x0a50,
- 0x3b2c, 0x171a, 0x233e, 0x2f08, 0x0b50,
- 0x3b2e, 0x171c, 0x2340, 0x2f0a, 0x0b52,
- 0x3a2e, 0x161c, 0x2240, 0x2e0a, 0x0a52,
- 0x392e, 0x151c, 0x2140, 0x2d0a, 0x0952,
- 0x3930, 0x151e, 0x2142, 0x2d0c, 0x0954,
- 0x3a30, 0x161e, 0x2242, 0x2e0c, 0x0a54,
- 0x3b30, 0x171e, 0x2342, 0x2f0c, 0x0b54,
- 0x3b32, 0x1720, 0x2344, 0x2f0e, 0x0b56,
- 0x3a32, 0x1620, 0x2244, 0x2e0e, 0x0a56,
- 0x3932, 0x1520, 0x2144, 0x2d0e, 0x0956,
- 0x3934, 0x1522, 0x2146, 0x2d10, 0x0958,
- 0x3a34, 0x1622, 0x2246, 0x2e10, 0x0a58,
- 0x3b34, 0x1722, 0x2346, 0x2f10, 0x0b58,
- 0x0324, 0x1b12, 0x2736, 0x3300, 0x0f48,
- 0x0424, 0x1c12, 0x2836, 0x3400, 0x1048,
- 0x0524, 0x1d12, 0x2936, 0x3500, 0x1148,
- 0x0526, 0x1d14, 0x2938, 0x3502, 0x114a,
- 0x0426, 0x1c14, 0x2838, 0x3402, 0x104a,
- 0x0326, 0x1b14, 0x2738, 0x3302, 0x0f4a,
- 0x0328, 0x1b16, 0x273a, 0x3304, 0x0f4c,
- 0x0428, 0x1c16, 0x283a, 0x3404, 0x104c,
- 0x0528, 0x1d16, 0x293a, 0x3504, 0x114c,
- 0x052a, 0x1d18, 0x293c, 0x3506, 0x114e,
- 0x042a, 0x1c18, 0x283c, 0x3406, 0x104e,
- 0x032a, 0x1b18, 0x273c, 0x3306, 0x0f4e,
- 0x032c, 0x1b1a, 0x273e, 0x3308, 0x0f50,
- 0x042c, 0x1c1a, 0x283e, 0x3408, 0x1050,
- 0x052c, 0x1d1a, 0x293e, 0x3508, 0x1150,
- 0x052e, 0x1d1c, 0x2940, 0x350a, 0x1152,
- 0x042e, 0x1c1c, 0x2840, 0x340a, 0x1052,
- 0x032e, 0x1b1c, 0x2740, 0x330a, 0x0f52,
- 0x0330, 0x1b1e, 0x2742, 0x330c, 0x0f54,
- 0x0430, 0x1c1e, 0x2842, 0x340c, 0x1054,
- 0x0530, 0x1d1e, 0x2942, 0x350c, 0x1154,
- 0x0532, 0x1d20, 0x2944, 0x350e, 0x1156,
- 0x0432, 0x1c20, 0x2844, 0x340e, 0x1056,
- 0x0332, 0x1b20, 0x2744, 0x330e, 0x0f56,
- 0x0334, 0x1b22, 0x2746, 0x3310, 0x0f58,
- 0x0434, 0x1c22, 0x2846, 0x3410, 0x1058,
- 0x0534, 0x1d22, 0x2946, 0x3510, 0x1158,
- 0x0924, 0x2112, 0x2d36, 0x3900, 0x1548,
- 0x0a24, 0x2212, 0x2e36, 0x3a00, 0x1648,
- 0x0b24, 0x2312, 0x2f36, 0x3b00, 0x1748,
- 0x0b26, 0x2314, 0x2f38, 0x3b02, 0x174a,
- 0x0a26, 0x2214, 0x2e38, 0x3a02, 0x164a,
- 0x0926, 0x2114, 0x2d38, 0x3902, 0x154a,
- 0x0928, 0x2116, 0x2d3a, 0x3904, 0x154c,
- 0x0a28, 0x2216, 0x2e3a, 0x3a04, 0x164c,
- 0x0b28, 0x2316, 0x2f3a, 0x3b04, 0x174c,
- 0x0b2a, 0x2318, 0x2f3c, 0x3b06, 0x174e,
- 0x0a2a, 0x2218, 0x2e3c, 0x3a06, 0x164e,
- 0x092a, 0x2118, 0x2d3c, 0x3906, 0x154e,
- 0x092c, 0x211a, 0x2d3e, 0x3908, 0x1550,
- 0x0a2c, 0x221a, 0x2e3e, 0x3a08, 0x1650,
- 0x0b2c, 0x231a, 0x2f3e, 0x3b08, 0x1750,
- 0x0b2e, 0x231c, 0x2f40, 0x3b0a, 0x1752,
- 0x0a2e, 0x221c, 0x2e40, 0x3a0a, 0x1652,
- 0x092e, 0x211c, 0x2d40, 0x390a, 0x1552,
- 0x0930, 0x211e, 0x2d42, 0x390c, 0x1554,
- 0x0a30, 0x221e, 0x2e42, 0x3a0c, 0x1654,
- 0x0b30, 0x231e, 0x2f42, 0x3b0c, 0x1754,
- 0x0b32, 0x2320, 0x2f44, 0x3b0e, 0x1756,
- 0x0a32, 0x2220, 0x2e44, 0x3a0e, 0x1656,
- 0x0932, 0x2120, 0x2d44, 0x390e, 0x1556,
- 0x0934, 0x2122, 0x2d46, 0x3910, 0x1558,
- 0x0a34, 0x2222, 0x2e46, 0x3a10, 0x1658,
- 0x0b34, 0x2322, 0x2f46, 0x3b10, 0x1758,
-};
-
-/* 2 channels per frame, 12 DIF sequences per channel,
- 27 video segments per DIF sequence, 5 macroblocks per video segment */
-static const uint16_t dv_place_422_625[2*12*27*5] = {
- 0x0c24, 0x2412, 0x3036, 0x0000, 0x1848,
- 0x0d24, 0x2512, 0x3136, 0x0100, 0x1948,
- 0x0e24, 0x2612, 0x3236, 0x0200, 0x1a48,
- 0x0e26, 0x2614, 0x3238, 0x0202, 0x1a4a,
- 0x0d26, 0x2514, 0x3138, 0x0102, 0x194a,
- 0x0c26, 0x2414, 0x3038, 0x0002, 0x184a,
- 0x0c28, 0x2416, 0x303a, 0x0004, 0x184c,
- 0x0d28, 0x2516, 0x313a, 0x0104, 0x194c,
- 0x0e28, 0x2616, 0x323a, 0x0204, 0x1a4c,
- 0x0e2a, 0x2618, 0x323c, 0x0206, 0x1a4e,
- 0x0d2a, 0x2518, 0x313c, 0x0106, 0x194e,
- 0x0c2a, 0x2418, 0x303c, 0x0006, 0x184e,
- 0x0c2c, 0x241a, 0x303e, 0x0008, 0x1850,
- 0x0d2c, 0x251a, 0x313e, 0x0108, 0x1950,
- 0x0e2c, 0x261a, 0x323e, 0x0208, 0x1a50,
- 0x0e2e, 0x261c, 0x3240, 0x020a, 0x1a52,
- 0x0d2e, 0x251c, 0x3140, 0x010a, 0x1952,
- 0x0c2e, 0x241c, 0x3040, 0x000a, 0x1852,
- 0x0c30, 0x241e, 0x3042, 0x000c, 0x1854,
- 0x0d30, 0x251e, 0x3142, 0x010c, 0x1954,
- 0x0e30, 0x261e, 0x3242, 0x020c, 0x1a54,
- 0x0e32, 0x2620, 0x3244, 0x020e, 0x1a56,
- 0x0d32, 0x2520, 0x3144, 0x010e, 0x1956,
- 0x0c32, 0x2420, 0x3044, 0x000e, 0x1856,
- 0x0c34, 0x2422, 0x3046, 0x0010, 0x1858,
- 0x0d34, 0x2522, 0x3146, 0x0110, 0x1958,
- 0x0e34, 0x2622, 0x3246, 0x0210, 0x1a58,
- 0x1224, 0x2a12, 0x3636, 0x0600, 0x1e48,
- 0x1324, 0x2b12, 0x3736, 0x0700, 0x1f48,
- 0x1424, 0x2c12, 0x3836, 0x0800, 0x2048,
- 0x1426, 0x2c14, 0x3838, 0x0802, 0x204a,
- 0x1326, 0x2b14, 0x3738, 0x0702, 0x1f4a,
- 0x1226, 0x2a14, 0x3638, 0x0602, 0x1e4a,
- 0x1228, 0x2a16, 0x363a, 0x0604, 0x1e4c,
- 0x1328, 0x2b16, 0x373a, 0x0704, 0x1f4c,
- 0x1428, 0x2c16, 0x383a, 0x0804, 0x204c,
- 0x142a, 0x2c18, 0x383c, 0x0806, 0x204e,
- 0x132a, 0x2b18, 0x373c, 0x0706, 0x1f4e,
- 0x122a, 0x2a18, 0x363c, 0x0606, 0x1e4e,
- 0x122c, 0x2a1a, 0x363e, 0x0608, 0x1e50,
- 0x132c, 0x2b1a, 0x373e, 0x0708, 0x1f50,
- 0x142c, 0x2c1a, 0x383e, 0x0808, 0x2050,
- 0x142e, 0x2c1c, 0x3840, 0x080a, 0x2052,
- 0x132e, 0x2b1c, 0x3740, 0x070a, 0x1f52,
- 0x122e, 0x2a1c, 0x3640, 0x060a, 0x1e52,
- 0x1230, 0x2a1e, 0x3642, 0x060c, 0x1e54,
- 0x1330, 0x2b1e, 0x3742, 0x070c, 0x1f54,
- 0x1430, 0x2c1e, 0x3842, 0x080c, 0x2054,
- 0x1432, 0x2c20, 0x3844, 0x080e, 0x2056,
- 0x1332, 0x2b20, 0x3744, 0x070e, 0x1f56,
- 0x1232, 0x2a20, 0x3644, 0x060e, 0x1e56,
- 0x1234, 0x2a22, 0x3646, 0x0610, 0x1e58,
- 0x1334, 0x2b22, 0x3746, 0x0710, 0x1f58,
- 0x1434, 0x2c22, 0x3846, 0x0810, 0x2058,
- 0x1824, 0x3012, 0x3c36, 0x0c00, 0x2448,
- 0x1924, 0x3112, 0x3d36, 0x0d00, 0x2548,
- 0x1a24, 0x3212, 0x3e36, 0x0e00, 0x2648,
- 0x1a26, 0x3214, 0x3e38, 0x0e02, 0x264a,
- 0x1926, 0x3114, 0x3d38, 0x0d02, 0x254a,
- 0x1826, 0x3014, 0x3c38, 0x0c02, 0x244a,
- 0x1828, 0x3016, 0x3c3a, 0x0c04, 0x244c,
- 0x1928, 0x3116, 0x3d3a, 0x0d04, 0x254c,
- 0x1a28, 0x3216, 0x3e3a, 0x0e04, 0x264c,
- 0x1a2a, 0x3218, 0x3e3c, 0x0e06, 0x264e,
- 0x192a, 0x3118, 0x3d3c, 0x0d06, 0x254e,
- 0x182a, 0x3018, 0x3c3c, 0x0c06, 0x244e,
- 0x182c, 0x301a, 0x3c3e, 0x0c08, 0x2450,
- 0x192c, 0x311a, 0x3d3e, 0x0d08, 0x2550,
- 0x1a2c, 0x321a, 0x3e3e, 0x0e08, 0x2650,
- 0x1a2e, 0x321c, 0x3e40, 0x0e0a, 0x2652,
- 0x192e, 0x311c, 0x3d40, 0x0d0a, 0x2552,
- 0x182e, 0x301c, 0x3c40, 0x0c0a, 0x2452,
- 0x1830, 0x301e, 0x3c42, 0x0c0c, 0x2454,
- 0x1930, 0x311e, 0x3d42, 0x0d0c, 0x2554,
- 0x1a30, 0x321e, 0x3e42, 0x0e0c, 0x2654,
- 0x1a32, 0x3220, 0x3e44, 0x0e0e, 0x2656,
- 0x1932, 0x3120, 0x3d44, 0x0d0e, 0x2556,
- 0x1832, 0x3020, 0x3c44, 0x0c0e, 0x2456,
- 0x1834, 0x3022, 0x3c46, 0x0c10, 0x2458,
- 0x1934, 0x3122, 0x3d46, 0x0d10, 0x2558,
- 0x1a34, 0x3222, 0x3e46, 0x0e10, 0x2658,
- 0x1e24, 0x3612, 0x4236, 0x1200, 0x2a48,
- 0x1f24, 0x3712, 0x4336, 0x1300, 0x2b48,
- 0x2024, 0x3812, 0x4436, 0x1400, 0x2c48,
- 0x2026, 0x3814, 0x4438, 0x1402, 0x2c4a,
- 0x1f26, 0x3714, 0x4338, 0x1302, 0x2b4a,
- 0x1e26, 0x3614, 0x4238, 0x1202, 0x2a4a,
- 0x1e28, 0x3616, 0x423a, 0x1204, 0x2a4c,
- 0x1f28, 0x3716, 0x433a, 0x1304, 0x2b4c,
- 0x2028, 0x3816, 0x443a, 0x1404, 0x2c4c,
- 0x202a, 0x3818, 0x443c, 0x1406, 0x2c4e,
- 0x1f2a, 0x3718, 0x433c, 0x1306, 0x2b4e,
- 0x1e2a, 0x3618, 0x423c, 0x1206, 0x2a4e,
- 0x1e2c, 0x361a, 0x423e, 0x1208, 0x2a50,
- 0x1f2c, 0x371a, 0x433e, 0x1308, 0x2b50,
- 0x202c, 0x381a, 0x443e, 0x1408, 0x2c50,
- 0x202e, 0x381c, 0x4440, 0x140a, 0x2c52,
- 0x1f2e, 0x371c, 0x4340, 0x130a, 0x2b52,
- 0x1e2e, 0x361c, 0x4240, 0x120a, 0x2a52,
- 0x1e30, 0x361e, 0x4242, 0x120c, 0x2a54,
- 0x1f30, 0x371e, 0x4342, 0x130c, 0x2b54,
- 0x2030, 0x381e, 0x4442, 0x140c, 0x2c54,
- 0x2032, 0x3820, 0x4444, 0x140e, 0x2c56,
- 0x1f32, 0x3720, 0x4344, 0x130e, 0x2b56,
- 0x1e32, 0x3620, 0x4244, 0x120e, 0x2a56,
- 0x1e34, 0x3622, 0x4246, 0x1210, 0x2a58,
- 0x1f34, 0x3722, 0x4346, 0x1310, 0x2b58,
- 0x2034, 0x3822, 0x4446, 0x1410, 0x2c58,
- 0x2424, 0x3c12, 0x0036, 0x1800, 0x3048,
- 0x2524, 0x3d12, 0x0136, 0x1900, 0x3148,
- 0x2624, 0x3e12, 0x0236, 0x1a00, 0x3248,
- 0x2626, 0x3e14, 0x0238, 0x1a02, 0x324a,
- 0x2526, 0x3d14, 0x0138, 0x1902, 0x314a,
- 0x2426, 0x3c14, 0x0038, 0x1802, 0x304a,
- 0x2428, 0x3c16, 0x003a, 0x1804, 0x304c,
- 0x2528, 0x3d16, 0x013a, 0x1904, 0x314c,
- 0x2628, 0x3e16, 0x023a, 0x1a04, 0x324c,
- 0x262a, 0x3e18, 0x023c, 0x1a06, 0x324e,
- 0x252a, 0x3d18, 0x013c, 0x1906, 0x314e,
- 0x242a, 0x3c18, 0x003c, 0x1806, 0x304e,
- 0x242c, 0x3c1a, 0x003e, 0x1808, 0x3050,
- 0x252c, 0x3d1a, 0x013e, 0x1908, 0x3150,
- 0x262c, 0x3e1a, 0x023e, 0x1a08, 0x3250,
- 0x262e, 0x3e1c, 0x0240, 0x1a0a, 0x3252,
- 0x252e, 0x3d1c, 0x0140, 0x190a, 0x3152,
- 0x242e, 0x3c1c, 0x0040, 0x180a, 0x3052,
- 0x2430, 0x3c1e, 0x0042, 0x180c, 0x3054,
- 0x2530, 0x3d1e, 0x0142, 0x190c, 0x3154,
- 0x2630, 0x3e1e, 0x0242, 0x1a0c, 0x3254,
- 0x2632, 0x3e20, 0x0244, 0x1a0e, 0x3256,
- 0x2532, 0x3d20, 0x0144, 0x190e, 0x3156,
- 0x2432, 0x3c20, 0x0044, 0x180e, 0x3056,
- 0x2434, 0x3c22, 0x0046, 0x1810, 0x3058,
- 0x2534, 0x3d22, 0x0146, 0x1910, 0x3158,
- 0x2634, 0x3e22, 0x0246, 0x1a10, 0x3258,
- 0x2a24, 0x4212, 0x0636, 0x1e00, 0x3648,
- 0x2b24, 0x4312, 0x0736, 0x1f00, 0x3748,
- 0x2c24, 0x4412, 0x0836, 0x2000, 0x3848,
- 0x2c26, 0x4414, 0x0838, 0x2002, 0x384a,
- 0x2b26, 0x4314, 0x0738, 0x1f02, 0x374a,
- 0x2a26, 0x4214, 0x0638, 0x1e02, 0x364a,
- 0x2a28, 0x4216, 0x063a, 0x1e04, 0x364c,
- 0x2b28, 0x4316, 0x073a, 0x1f04, 0x374c,
- 0x2c28, 0x4416, 0x083a, 0x2004, 0x384c,
- 0x2c2a, 0x4418, 0x083c, 0x2006, 0x384e,
- 0x2b2a, 0x4318, 0x073c, 0x1f06, 0x374e,
- 0x2a2a, 0x4218, 0x063c, 0x1e06, 0x364e,
- 0x2a2c, 0x421a, 0x063e, 0x1e08, 0x3650,
- 0x2b2c, 0x431a, 0x073e, 0x1f08, 0x3750,
- 0x2c2c, 0x441a, 0x083e, 0x2008, 0x3850,
- 0x2c2e, 0x441c, 0x0840, 0x200a, 0x3852,
- 0x2b2e, 0x431c, 0x0740, 0x1f0a, 0x3752,
- 0x2a2e, 0x421c, 0x0640, 0x1e0a, 0x3652,
- 0x2a30, 0x421e, 0x0642, 0x1e0c, 0x3654,
- 0x2b30, 0x431e, 0x0742, 0x1f0c, 0x3754,
- 0x2c30, 0x441e, 0x0842, 0x200c, 0x3854,
- 0x2c32, 0x4420, 0x0844, 0x200e, 0x3856,
- 0x2b32, 0x4320, 0x0744, 0x1f0e, 0x3756,
- 0x2a32, 0x4220, 0x0644, 0x1e0e, 0x3656,
- 0x2a34, 0x4222, 0x0646, 0x1e10, 0x3658,
- 0x2b34, 0x4322, 0x0746, 0x1f10, 0x3758,
- 0x2c34, 0x4422, 0x0846, 0x2010, 0x3858,
- 0x3024, 0x0012, 0x0c36, 0x2400, 0x3c48,
- 0x3124, 0x0112, 0x0d36, 0x2500, 0x3d48,
- 0x3224, 0x0212, 0x0e36, 0x2600, 0x3e48,
- 0x3226, 0x0214, 0x0e38, 0x2602, 0x3e4a,
- 0x3126, 0x0114, 0x0d38, 0x2502, 0x3d4a,
- 0x3026, 0x0014, 0x0c38, 0x2402, 0x3c4a,
- 0x3028, 0x0016, 0x0c3a, 0x2404, 0x3c4c,
- 0x3128, 0x0116, 0x0d3a, 0x2504, 0x3d4c,
- 0x3228, 0x0216, 0x0e3a, 0x2604, 0x3e4c,
- 0x322a, 0x0218, 0x0e3c, 0x2606, 0x3e4e,
- 0x312a, 0x0118, 0x0d3c, 0x2506, 0x3d4e,
- 0x302a, 0x0018, 0x0c3c, 0x2406, 0x3c4e,
- 0x302c, 0x001a, 0x0c3e, 0x2408, 0x3c50,
- 0x312c, 0x011a, 0x0d3e, 0x2508, 0x3d50,
- 0x322c, 0x021a, 0x0e3e, 0x2608, 0x3e50,
- 0x322e, 0x021c, 0x0e40, 0x260a, 0x3e52,
- 0x312e, 0x011c, 0x0d40, 0x250a, 0x3d52,
- 0x302e, 0x001c, 0x0c40, 0x240a, 0x3c52,
- 0x3030, 0x001e, 0x0c42, 0x240c, 0x3c54,
- 0x3130, 0x011e, 0x0d42, 0x250c, 0x3d54,
- 0x3230, 0x021e, 0x0e42, 0x260c, 0x3e54,
- 0x3232, 0x0220, 0x0e44, 0x260e, 0x3e56,
- 0x3132, 0x0120, 0x0d44, 0x250e, 0x3d56,
- 0x3032, 0x0020, 0x0c44, 0x240e, 0x3c56,
- 0x3034, 0x0022, 0x0c46, 0x2410, 0x3c58,
- 0x3134, 0x0122, 0x0d46, 0x2510, 0x3d58,
- 0x3234, 0x0222, 0x0e46, 0x2610, 0x3e58,
- 0x3624, 0x0612, 0x1236, 0x2a00, 0x4248,
- 0x3724, 0x0712, 0x1336, 0x2b00, 0x4348,
- 0x3824, 0x0812, 0x1436, 0x2c00, 0x4448,
- 0x3826, 0x0814, 0x1438, 0x2c02, 0x444a,
- 0x3726, 0x0714, 0x1338, 0x2b02, 0x434a,
- 0x3626, 0x0614, 0x1238, 0x2a02, 0x424a,
- 0x3628, 0x0616, 0x123a, 0x2a04, 0x424c,
- 0x3728, 0x0716, 0x133a, 0x2b04, 0x434c,
- 0x3828, 0x0816, 0x143a, 0x2c04, 0x444c,
- 0x382a, 0x0818, 0x143c, 0x2c06, 0x444e,
- 0x372a, 0x0718, 0x133c, 0x2b06, 0x434e,
- 0x362a, 0x0618, 0x123c, 0x2a06, 0x424e,
- 0x362c, 0x061a, 0x123e, 0x2a08, 0x4250,
- 0x372c, 0x071a, 0x133e, 0x2b08, 0x4350,
- 0x382c, 0x081a, 0x143e, 0x2c08, 0x4450,
- 0x382e, 0x081c, 0x1440, 0x2c0a, 0x4452,
- 0x372e, 0x071c, 0x1340, 0x2b0a, 0x4352,
- 0x362e, 0x061c, 0x1240, 0x2a0a, 0x4252,
- 0x3630, 0x061e, 0x1242, 0x2a0c, 0x4254,
- 0x3730, 0x071e, 0x1342, 0x2b0c, 0x4354,
- 0x3830, 0x081e, 0x1442, 0x2c0c, 0x4454,
- 0x3832, 0x0820, 0x1444, 0x2c0e, 0x4456,
- 0x3732, 0x0720, 0x1344, 0x2b0e, 0x4356,
- 0x3632, 0x0620, 0x1244, 0x2a0e, 0x4256,
- 0x3634, 0x0622, 0x1246, 0x2a10, 0x4258,
- 0x3734, 0x0722, 0x1346, 0x2b10, 0x4358,
- 0x3834, 0x0822, 0x1446, 0x2c10, 0x4458,
- 0x3c24, 0x0c12, 0x1836, 0x3000, 0x0048,
- 0x3d24, 0x0d12, 0x1936, 0x3100, 0x0148,
- 0x3e24, 0x0e12, 0x1a36, 0x3200, 0x0248,
- 0x3e26, 0x0e14, 0x1a38, 0x3202, 0x024a,
- 0x3d26, 0x0d14, 0x1938, 0x3102, 0x014a,
- 0x3c26, 0x0c14, 0x1838, 0x3002, 0x004a,
- 0x3c28, 0x0c16, 0x183a, 0x3004, 0x004c,
- 0x3d28, 0x0d16, 0x193a, 0x3104, 0x014c,
- 0x3e28, 0x0e16, 0x1a3a, 0x3204, 0x024c,
- 0x3e2a, 0x0e18, 0x1a3c, 0x3206, 0x024e,
- 0x3d2a, 0x0d18, 0x193c, 0x3106, 0x014e,
- 0x3c2a, 0x0c18, 0x183c, 0x3006, 0x004e,
- 0x3c2c, 0x0c1a, 0x183e, 0x3008, 0x0050,
- 0x3d2c, 0x0d1a, 0x193e, 0x3108, 0x0150,
- 0x3e2c, 0x0e1a, 0x1a3e, 0x3208, 0x0250,
- 0x3e2e, 0x0e1c, 0x1a40, 0x320a, 0x0252,
- 0x3d2e, 0x0d1c, 0x1940, 0x310a, 0x0152,
- 0x3c2e, 0x0c1c, 0x1840, 0x300a, 0x0052,
- 0x3c30, 0x0c1e, 0x1842, 0x300c, 0x0054,
- 0x3d30, 0x0d1e, 0x1942, 0x310c, 0x0154,
- 0x3e30, 0x0e1e, 0x1a42, 0x320c, 0x0254,
- 0x3e32, 0x0e20, 0x1a44, 0x320e, 0x0256,
- 0x3d32, 0x0d20, 0x1944, 0x310e, 0x0156,
- 0x3c32, 0x0c20, 0x1844, 0x300e, 0x0056,
- 0x3c34, 0x0c22, 0x1846, 0x3010, 0x0058,
- 0x3d34, 0x0d22, 0x1946, 0x3110, 0x0158,
- 0x3e34, 0x0e22, 0x1a46, 0x3210, 0x0258,
- 0x4224, 0x1212, 0x1e36, 0x3600, 0x0648,
- 0x4324, 0x1312, 0x1f36, 0x3700, 0x0748,
- 0x4424, 0x1412, 0x2036, 0x3800, 0x0848,
- 0x4426, 0x1414, 0x2038, 0x3802, 0x084a,
- 0x4326, 0x1314, 0x1f38, 0x3702, 0x074a,
- 0x4226, 0x1214, 0x1e38, 0x3602, 0x064a,
- 0x4228, 0x1216, 0x1e3a, 0x3604, 0x064c,
- 0x4328, 0x1316, 0x1f3a, 0x3704, 0x074c,
- 0x4428, 0x1416, 0x203a, 0x3804, 0x084c,
- 0x442a, 0x1418, 0x203c, 0x3806, 0x084e,
- 0x432a, 0x1318, 0x1f3c, 0x3706, 0x074e,
- 0x422a, 0x1218, 0x1e3c, 0x3606, 0x064e,
- 0x422c, 0x121a, 0x1e3e, 0x3608, 0x0650,
- 0x432c, 0x131a, 0x1f3e, 0x3708, 0x0750,
- 0x442c, 0x141a, 0x203e, 0x3808, 0x0850,
- 0x442e, 0x141c, 0x2040, 0x380a, 0x0852,
- 0x432e, 0x131c, 0x1f40, 0x370a, 0x0752,
- 0x422e, 0x121c, 0x1e40, 0x360a, 0x0652,
- 0x4230, 0x121e, 0x1e42, 0x360c, 0x0654,
- 0x4330, 0x131e, 0x1f42, 0x370c, 0x0754,
- 0x4430, 0x141e, 0x2042, 0x380c, 0x0854,
- 0x4432, 0x1420, 0x2044, 0x380e, 0x0856,
- 0x4332, 0x1320, 0x1f44, 0x370e, 0x0756,
- 0x4232, 0x1220, 0x1e44, 0x360e, 0x0656,
- 0x4234, 0x1222, 0x1e46, 0x3610, 0x0658,
- 0x4334, 0x1322, 0x1f46, 0x3710, 0x0758,
- 0x4434, 0x1422, 0x2046, 0x3810, 0x0858,
- 0x0024, 0x1812, 0x2436, 0x3c00, 0x0c48,
- 0x0124, 0x1912, 0x2536, 0x3d00, 0x0d48,
- 0x0224, 0x1a12, 0x2636, 0x3e00, 0x0e48,
- 0x0226, 0x1a14, 0x2638, 0x3e02, 0x0e4a,
- 0x0126, 0x1914, 0x2538, 0x3d02, 0x0d4a,
- 0x0026, 0x1814, 0x2438, 0x3c02, 0x0c4a,
- 0x0028, 0x1816, 0x243a, 0x3c04, 0x0c4c,
- 0x0128, 0x1916, 0x253a, 0x3d04, 0x0d4c,
- 0x0228, 0x1a16, 0x263a, 0x3e04, 0x0e4c,
- 0x022a, 0x1a18, 0x263c, 0x3e06, 0x0e4e,
- 0x012a, 0x1918, 0x253c, 0x3d06, 0x0d4e,
- 0x002a, 0x1818, 0x243c, 0x3c06, 0x0c4e,
- 0x002c, 0x181a, 0x243e, 0x3c08, 0x0c50,
- 0x012c, 0x191a, 0x253e, 0x3d08, 0x0d50,
- 0x022c, 0x1a1a, 0x263e, 0x3e08, 0x0e50,
- 0x022e, 0x1a1c, 0x2640, 0x3e0a, 0x0e52,
- 0x012e, 0x191c, 0x2540, 0x3d0a, 0x0d52,
- 0x002e, 0x181c, 0x2440, 0x3c0a, 0x0c52,
- 0x0030, 0x181e, 0x2442, 0x3c0c, 0x0c54,
- 0x0130, 0x191e, 0x2542, 0x3d0c, 0x0d54,
- 0x0230, 0x1a1e, 0x2642, 0x3e0c, 0x0e54,
- 0x0232, 0x1a20, 0x2644, 0x3e0e, 0x0e56,
- 0x0132, 0x1920, 0x2544, 0x3d0e, 0x0d56,
- 0x0032, 0x1820, 0x2444, 0x3c0e, 0x0c56,
- 0x0034, 0x1822, 0x2446, 0x3c10, 0x0c58,
- 0x0134, 0x1922, 0x2546, 0x3d10, 0x0d58,
- 0x0234, 0x1a22, 0x2646, 0x3e10, 0x0e58,
- 0x0624, 0x1e12, 0x2a36, 0x4200, 0x1248,
- 0x0724, 0x1f12, 0x2b36, 0x4300, 0x1348,
- 0x0824, 0x2012, 0x2c36, 0x4400, 0x1448,
- 0x0826, 0x2014, 0x2c38, 0x4402, 0x144a,
- 0x0726, 0x1f14, 0x2b38, 0x4302, 0x134a,
- 0x0626, 0x1e14, 0x2a38, 0x4202, 0x124a,
- 0x0628, 0x1e16, 0x2a3a, 0x4204, 0x124c,
- 0x0728, 0x1f16, 0x2b3a, 0x4304, 0x134c,
- 0x0828, 0x2016, 0x2c3a, 0x4404, 0x144c,
- 0x082a, 0x2018, 0x2c3c, 0x4406, 0x144e,
- 0x072a, 0x1f18, 0x2b3c, 0x4306, 0x134e,
- 0x062a, 0x1e18, 0x2a3c, 0x4206, 0x124e,
- 0x062c, 0x1e1a, 0x2a3e, 0x4208, 0x1250,
- 0x072c, 0x1f1a, 0x2b3e, 0x4308, 0x1350,
- 0x082c, 0x201a, 0x2c3e, 0x4408, 0x1450,
- 0x082e, 0x201c, 0x2c40, 0x440a, 0x1452,
- 0x072e, 0x1f1c, 0x2b40, 0x430a, 0x1352,
- 0x062e, 0x1e1c, 0x2a40, 0x420a, 0x1252,
- 0x0630, 0x1e1e, 0x2a42, 0x420c, 0x1254,
- 0x0730, 0x1f1e, 0x2b42, 0x430c, 0x1354,
- 0x0830, 0x201e, 0x2c42, 0x440c, 0x1454,
- 0x0832, 0x2020, 0x2c44, 0x440e, 0x1456,
- 0x0732, 0x1f20, 0x2b44, 0x430e, 0x1356,
- 0x0632, 0x1e20, 0x2a44, 0x420e, 0x1256,
- 0x0634, 0x1e22, 0x2a46, 0x4210, 0x1258,
- 0x0734, 0x1f22, 0x2b46, 0x4310, 0x1358,
- 0x0834, 0x2022, 0x2c46, 0x4410, 0x1458,
- 0x0f24, 0x2712, 0x3336, 0x0300, 0x1b48,
- 0x1024, 0x2812, 0x3436, 0x0400, 0x1c48,
- 0x1124, 0x2912, 0x3536, 0x0500, 0x1d48,
- 0x1126, 0x2914, 0x3538, 0x0502, 0x1d4a,
- 0x1026, 0x2814, 0x3438, 0x0402, 0x1c4a,
- 0x0f26, 0x2714, 0x3338, 0x0302, 0x1b4a,
- 0x0f28, 0x2716, 0x333a, 0x0304, 0x1b4c,
- 0x1028, 0x2816, 0x343a, 0x0404, 0x1c4c,
- 0x1128, 0x2916, 0x353a, 0x0504, 0x1d4c,
- 0x112a, 0x2918, 0x353c, 0x0506, 0x1d4e,
- 0x102a, 0x2818, 0x343c, 0x0406, 0x1c4e,
- 0x0f2a, 0x2718, 0x333c, 0x0306, 0x1b4e,
- 0x0f2c, 0x271a, 0x333e, 0x0308, 0x1b50,
- 0x102c, 0x281a, 0x343e, 0x0408, 0x1c50,
- 0x112c, 0x291a, 0x353e, 0x0508, 0x1d50,
- 0x112e, 0x291c, 0x3540, 0x050a, 0x1d52,
- 0x102e, 0x281c, 0x3440, 0x040a, 0x1c52,
- 0x0f2e, 0x271c, 0x3340, 0x030a, 0x1b52,
- 0x0f30, 0x271e, 0x3342, 0x030c, 0x1b54,
- 0x1030, 0x281e, 0x3442, 0x040c, 0x1c54,
- 0x1130, 0x291e, 0x3542, 0x050c, 0x1d54,
- 0x1132, 0x2920, 0x3544, 0x050e, 0x1d56,
- 0x1032, 0x2820, 0x3444, 0x040e, 0x1c56,
- 0x0f32, 0x2720, 0x3344, 0x030e, 0x1b56,
- 0x0f34, 0x2722, 0x3346, 0x0310, 0x1b58,
- 0x1034, 0x2822, 0x3446, 0x0410, 0x1c58,
- 0x1134, 0x2922, 0x3546, 0x0510, 0x1d58,
- 0x1524, 0x2d12, 0x3936, 0x0900, 0x2148,
- 0x1624, 0x2e12, 0x3a36, 0x0a00, 0x2248,
- 0x1724, 0x2f12, 0x3b36, 0x0b00, 0x2348,
- 0x1726, 0x2f14, 0x3b38, 0x0b02, 0x234a,
- 0x1626, 0x2e14, 0x3a38, 0x0a02, 0x224a,
- 0x1526, 0x2d14, 0x3938, 0x0902, 0x214a,
- 0x1528, 0x2d16, 0x393a, 0x0904, 0x214c,
- 0x1628, 0x2e16, 0x3a3a, 0x0a04, 0x224c,
- 0x1728, 0x2f16, 0x3b3a, 0x0b04, 0x234c,
- 0x172a, 0x2f18, 0x3b3c, 0x0b06, 0x234e,
- 0x162a, 0x2e18, 0x3a3c, 0x0a06, 0x224e,
- 0x152a, 0x2d18, 0x393c, 0x0906, 0x214e,
- 0x152c, 0x2d1a, 0x393e, 0x0908, 0x2150,
- 0x162c, 0x2e1a, 0x3a3e, 0x0a08, 0x2250,
- 0x172c, 0x2f1a, 0x3b3e, 0x0b08, 0x2350,
- 0x172e, 0x2f1c, 0x3b40, 0x0b0a, 0x2352,
- 0x162e, 0x2e1c, 0x3a40, 0x0a0a, 0x2252,
- 0x152e, 0x2d1c, 0x3940, 0x090a, 0x2152,
- 0x1530, 0x2d1e, 0x3942, 0x090c, 0x2154,
- 0x1630, 0x2e1e, 0x3a42, 0x0a0c, 0x2254,
- 0x1730, 0x2f1e, 0x3b42, 0x0b0c, 0x2354,
- 0x1732, 0x2f20, 0x3b44, 0x0b0e, 0x2356,
- 0x1632, 0x2e20, 0x3a44, 0x0a0e, 0x2256,
- 0x1532, 0x2d20, 0x3944, 0x090e, 0x2156,
- 0x1534, 0x2d22, 0x3946, 0x0910, 0x2158,
- 0x1634, 0x2e22, 0x3a46, 0x0a10, 0x2258,
- 0x1734, 0x2f22, 0x3b46, 0x0b10, 0x2358,
- 0x1b24, 0x3312, 0x3f36, 0x0f00, 0x2748,
- 0x1c24, 0x3412, 0x4036, 0x1000, 0x2848,
- 0x1d24, 0x3512, 0x4136, 0x1100, 0x2948,
- 0x1d26, 0x3514, 0x4138, 0x1102, 0x294a,
- 0x1c26, 0x3414, 0x4038, 0x1002, 0x284a,
- 0x1b26, 0x3314, 0x3f38, 0x0f02, 0x274a,
- 0x1b28, 0x3316, 0x3f3a, 0x0f04, 0x274c,
- 0x1c28, 0x3416, 0x403a, 0x1004, 0x284c,
- 0x1d28, 0x3516, 0x413a, 0x1104, 0x294c,
- 0x1d2a, 0x3518, 0x413c, 0x1106, 0x294e,
- 0x1c2a, 0x3418, 0x403c, 0x1006, 0x284e,
- 0x1b2a, 0x3318, 0x3f3c, 0x0f06, 0x274e,
- 0x1b2c, 0x331a, 0x3f3e, 0x0f08, 0x2750,
- 0x1c2c, 0x341a, 0x403e, 0x1008, 0x2850,
- 0x1d2c, 0x351a, 0x413e, 0x1108, 0x2950,
- 0x1d2e, 0x351c, 0x4140, 0x110a, 0x2952,
- 0x1c2e, 0x341c, 0x4040, 0x100a, 0x2852,
- 0x1b2e, 0x331c, 0x3f40, 0x0f0a, 0x2752,
- 0x1b30, 0x331e, 0x3f42, 0x0f0c, 0x2754,
- 0x1c30, 0x341e, 0x4042, 0x100c, 0x2854,
- 0x1d30, 0x351e, 0x4142, 0x110c, 0x2954,
- 0x1d32, 0x3520, 0x4144, 0x110e, 0x2956,
- 0x1c32, 0x3420, 0x4044, 0x100e, 0x2856,
- 0x1b32, 0x3320, 0x3f44, 0x0f0e, 0x2756,
- 0x1b34, 0x3322, 0x3f46, 0x0f10, 0x2758,
- 0x1c34, 0x3422, 0x4046, 0x1010, 0x2858,
- 0x1d34, 0x3522, 0x4146, 0x1110, 0x2958,
- 0x2124, 0x3912, 0x4536, 0x1500, 0x2d48,
- 0x2224, 0x3a12, 0x4636, 0x1600, 0x2e48,
- 0x2324, 0x3b12, 0x4736, 0x1700, 0x2f48,
- 0x2326, 0x3b14, 0x4738, 0x1702, 0x2f4a,
- 0x2226, 0x3a14, 0x4638, 0x1602, 0x2e4a,
- 0x2126, 0x3914, 0x4538, 0x1502, 0x2d4a,
- 0x2128, 0x3916, 0x453a, 0x1504, 0x2d4c,
- 0x2228, 0x3a16, 0x463a, 0x1604, 0x2e4c,
- 0x2328, 0x3b16, 0x473a, 0x1704, 0x2f4c,
- 0x232a, 0x3b18, 0x473c, 0x1706, 0x2f4e,
- 0x222a, 0x3a18, 0x463c, 0x1606, 0x2e4e,
- 0x212a, 0x3918, 0x453c, 0x1506, 0x2d4e,
- 0x212c, 0x391a, 0x453e, 0x1508, 0x2d50,
- 0x222c, 0x3a1a, 0x463e, 0x1608, 0x2e50,
- 0x232c, 0x3b1a, 0x473e, 0x1708, 0x2f50,
- 0x232e, 0x3b1c, 0x4740, 0x170a, 0x2f52,
- 0x222e, 0x3a1c, 0x4640, 0x160a, 0x2e52,
- 0x212e, 0x391c, 0x4540, 0x150a, 0x2d52,
- 0x2130, 0x391e, 0x4542, 0x150c, 0x2d54,
- 0x2230, 0x3a1e, 0x4642, 0x160c, 0x2e54,
- 0x2330, 0x3b1e, 0x4742, 0x170c, 0x2f54,
- 0x2332, 0x3b20, 0x4744, 0x170e, 0x2f56,
- 0x2232, 0x3a20, 0x4644, 0x160e, 0x2e56,
- 0x2132, 0x3920, 0x4544, 0x150e, 0x2d56,
- 0x2134, 0x3922, 0x4546, 0x1510, 0x2d58,
- 0x2234, 0x3a22, 0x4646, 0x1610, 0x2e58,
- 0x2334, 0x3b22, 0x4746, 0x1710, 0x2f58,
- 0x2724, 0x3f12, 0x0336, 0x1b00, 0x3348,
- 0x2824, 0x4012, 0x0436, 0x1c00, 0x3448,
- 0x2924, 0x4112, 0x0536, 0x1d00, 0x3548,
- 0x2926, 0x4114, 0x0538, 0x1d02, 0x354a,
- 0x2826, 0x4014, 0x0438, 0x1c02, 0x344a,
- 0x2726, 0x3f14, 0x0338, 0x1b02, 0x334a,
- 0x2728, 0x3f16, 0x033a, 0x1b04, 0x334c,
- 0x2828, 0x4016, 0x043a, 0x1c04, 0x344c,
- 0x2928, 0x4116, 0x053a, 0x1d04, 0x354c,
- 0x292a, 0x4118, 0x053c, 0x1d06, 0x354e,
- 0x282a, 0x4018, 0x043c, 0x1c06, 0x344e,
- 0x272a, 0x3f18, 0x033c, 0x1b06, 0x334e,
- 0x272c, 0x3f1a, 0x033e, 0x1b08, 0x3350,
- 0x282c, 0x401a, 0x043e, 0x1c08, 0x3450,
- 0x292c, 0x411a, 0x053e, 0x1d08, 0x3550,
- 0x292e, 0x411c, 0x0540, 0x1d0a, 0x3552,
- 0x282e, 0x401c, 0x0440, 0x1c0a, 0x3452,
- 0x272e, 0x3f1c, 0x0340, 0x1b0a, 0x3352,
- 0x2730, 0x3f1e, 0x0342, 0x1b0c, 0x3354,
- 0x2830, 0x401e, 0x0442, 0x1c0c, 0x3454,
- 0x2930, 0x411e, 0x0542, 0x1d0c, 0x3554,
- 0x2932, 0x4120, 0x0544, 0x1d0e, 0x3556,
- 0x2832, 0x4020, 0x0444, 0x1c0e, 0x3456,
- 0x2732, 0x3f20, 0x0344, 0x1b0e, 0x3356,
- 0x2734, 0x3f22, 0x0346, 0x1b10, 0x3358,
- 0x2834, 0x4022, 0x0446, 0x1c10, 0x3458,
- 0x2934, 0x4122, 0x0546, 0x1d10, 0x3558,
- 0x2d24, 0x4512, 0x0936, 0x2100, 0x3948,
- 0x2e24, 0x4612, 0x0a36, 0x2200, 0x3a48,
- 0x2f24, 0x4712, 0x0b36, 0x2300, 0x3b48,
- 0x2f26, 0x4714, 0x0b38, 0x2302, 0x3b4a,
- 0x2e26, 0x4614, 0x0a38, 0x2202, 0x3a4a,
- 0x2d26, 0x4514, 0x0938, 0x2102, 0x394a,
- 0x2d28, 0x4516, 0x093a, 0x2104, 0x394c,
- 0x2e28, 0x4616, 0x0a3a, 0x2204, 0x3a4c,
- 0x2f28, 0x4716, 0x0b3a, 0x2304, 0x3b4c,
- 0x2f2a, 0x4718, 0x0b3c, 0x2306, 0x3b4e,
- 0x2e2a, 0x4618, 0x0a3c, 0x2206, 0x3a4e,
- 0x2d2a, 0x4518, 0x093c, 0x2106, 0x394e,
- 0x2d2c, 0x451a, 0x093e, 0x2108, 0x3950,
- 0x2e2c, 0x461a, 0x0a3e, 0x2208, 0x3a50,
- 0x2f2c, 0x471a, 0x0b3e, 0x2308, 0x3b50,
- 0x2f2e, 0x471c, 0x0b40, 0x230a, 0x3b52,
- 0x2e2e, 0x461c, 0x0a40, 0x220a, 0x3a52,
- 0x2d2e, 0x451c, 0x0940, 0x210a, 0x3952,
- 0x2d30, 0x451e, 0x0942, 0x210c, 0x3954,
- 0x2e30, 0x461e, 0x0a42, 0x220c, 0x3a54,
- 0x2f30, 0x471e, 0x0b42, 0x230c, 0x3b54,
- 0x2f32, 0x4720, 0x0b44, 0x230e, 0x3b56,
- 0x2e32, 0x4620, 0x0a44, 0x220e, 0x3a56,
- 0x2d32, 0x4520, 0x0944, 0x210e, 0x3956,
- 0x2d34, 0x4522, 0x0946, 0x2110, 0x3958,
- 0x2e34, 0x4622, 0x0a46, 0x2210, 0x3a58,
- 0x2f34, 0x4722, 0x0b46, 0x2310, 0x3b58,
- 0x3324, 0x0312, 0x0f36, 0x2700, 0x3f48,
- 0x3424, 0x0412, 0x1036, 0x2800, 0x4048,
- 0x3524, 0x0512, 0x1136, 0x2900, 0x4148,
- 0x3526, 0x0514, 0x1138, 0x2902, 0x414a,
- 0x3426, 0x0414, 0x1038, 0x2802, 0x404a,
- 0x3326, 0x0314, 0x0f38, 0x2702, 0x3f4a,
- 0x3328, 0x0316, 0x0f3a, 0x2704, 0x3f4c,
- 0x3428, 0x0416, 0x103a, 0x2804, 0x404c,
- 0x3528, 0x0516, 0x113a, 0x2904, 0x414c,
- 0x352a, 0x0518, 0x113c, 0x2906, 0x414e,
- 0x342a, 0x0418, 0x103c, 0x2806, 0x404e,
- 0x332a, 0x0318, 0x0f3c, 0x2706, 0x3f4e,
- 0x332c, 0x031a, 0x0f3e, 0x2708, 0x3f50,
- 0x342c, 0x041a, 0x103e, 0x2808, 0x4050,
- 0x352c, 0x051a, 0x113e, 0x2908, 0x4150,
- 0x352e, 0x051c, 0x1140, 0x290a, 0x4152,
- 0x342e, 0x041c, 0x1040, 0x280a, 0x4052,
- 0x332e, 0x031c, 0x0f40, 0x270a, 0x3f52,
- 0x3330, 0x031e, 0x0f42, 0x270c, 0x3f54,
- 0x3430, 0x041e, 0x1042, 0x280c, 0x4054,
- 0x3530, 0x051e, 0x1142, 0x290c, 0x4154,
- 0x3532, 0x0520, 0x1144, 0x290e, 0x4156,
- 0x3432, 0x0420, 0x1044, 0x280e, 0x4056,
- 0x3332, 0x0320, 0x0f44, 0x270e, 0x3f56,
- 0x3334, 0x0322, 0x0f46, 0x2710, 0x3f58,
- 0x3434, 0x0422, 0x1046, 0x2810, 0x4058,
- 0x3534, 0x0522, 0x1146, 0x2910, 0x4158,
- 0x3924, 0x0912, 0x1536, 0x2d00, 0x4548,
- 0x3a24, 0x0a12, 0x1636, 0x2e00, 0x4648,
- 0x3b24, 0x0b12, 0x1736, 0x2f00, 0x4748,
- 0x3b26, 0x0b14, 0x1738, 0x2f02, 0x474a,
- 0x3a26, 0x0a14, 0x1638, 0x2e02, 0x464a,
- 0x3926, 0x0914, 0x1538, 0x2d02, 0x454a,
- 0x3928, 0x0916, 0x153a, 0x2d04, 0x454c,
- 0x3a28, 0x0a16, 0x163a, 0x2e04, 0x464c,
- 0x3b28, 0x0b16, 0x173a, 0x2f04, 0x474c,
- 0x3b2a, 0x0b18, 0x173c, 0x2f06, 0x474e,
- 0x3a2a, 0x0a18, 0x163c, 0x2e06, 0x464e,
- 0x392a, 0x0918, 0x153c, 0x2d06, 0x454e,
- 0x392c, 0x091a, 0x153e, 0x2d08, 0x4550,
- 0x3a2c, 0x0a1a, 0x163e, 0x2e08, 0x4650,
- 0x3b2c, 0x0b1a, 0x173e, 0x2f08, 0x4750,
- 0x3b2e, 0x0b1c, 0x1740, 0x2f0a, 0x4752,
- 0x3a2e, 0x0a1c, 0x1640, 0x2e0a, 0x4652,
- 0x392e, 0x091c, 0x1540, 0x2d0a, 0x4552,
- 0x3930, 0x091e, 0x1542, 0x2d0c, 0x4554,
- 0x3a30, 0x0a1e, 0x1642, 0x2e0c, 0x4654,
- 0x3b30, 0x0b1e, 0x1742, 0x2f0c, 0x4754,
- 0x3b32, 0x0b20, 0x1744, 0x2f0e, 0x4756,
- 0x3a32, 0x0a20, 0x1644, 0x2e0e, 0x4656,
- 0x3932, 0x0920, 0x1544, 0x2d0e, 0x4556,
- 0x3934, 0x0922, 0x1546, 0x2d10, 0x4558,
- 0x3a34, 0x0a22, 0x1646, 0x2e10, 0x4658,
- 0x3b34, 0x0b22, 0x1746, 0x2f10, 0x4758,
- 0x3f24, 0x0f12, 0x1b36, 0x3300, 0x0348,
- 0x4024, 0x1012, 0x1c36, 0x3400, 0x0448,
- 0x4124, 0x1112, 0x1d36, 0x3500, 0x0548,
- 0x4126, 0x1114, 0x1d38, 0x3502, 0x054a,
- 0x4026, 0x1014, 0x1c38, 0x3402, 0x044a,
- 0x3f26, 0x0f14, 0x1b38, 0x3302, 0x034a,
- 0x3f28, 0x0f16, 0x1b3a, 0x3304, 0x034c,
- 0x4028, 0x1016, 0x1c3a, 0x3404, 0x044c,
- 0x4128, 0x1116, 0x1d3a, 0x3504, 0x054c,
- 0x412a, 0x1118, 0x1d3c, 0x3506, 0x054e,
- 0x402a, 0x1018, 0x1c3c, 0x3406, 0x044e,
- 0x3f2a, 0x0f18, 0x1b3c, 0x3306, 0x034e,
- 0x3f2c, 0x0f1a, 0x1b3e, 0x3308, 0x0350,
- 0x402c, 0x101a, 0x1c3e, 0x3408, 0x0450,
- 0x412c, 0x111a, 0x1d3e, 0x3508, 0x0550,
- 0x412e, 0x111c, 0x1d40, 0x350a, 0x0552,
- 0x402e, 0x101c, 0x1c40, 0x340a, 0x0452,
- 0x3f2e, 0x0f1c, 0x1b40, 0x330a, 0x0352,
- 0x3f30, 0x0f1e, 0x1b42, 0x330c, 0x0354,
- 0x4030, 0x101e, 0x1c42, 0x340c, 0x0454,
- 0x4130, 0x111e, 0x1d42, 0x350c, 0x0554,
- 0x4132, 0x1120, 0x1d44, 0x350e, 0x0556,
- 0x4032, 0x1020, 0x1c44, 0x340e, 0x0456,
- 0x3f32, 0x0f20, 0x1b44, 0x330e, 0x0356,
- 0x3f34, 0x0f22, 0x1b46, 0x3310, 0x0358,
- 0x4034, 0x1022, 0x1c46, 0x3410, 0x0458,
- 0x4134, 0x1122, 0x1d46, 0x3510, 0x0558,
- 0x4524, 0x1512, 0x2136, 0x3900, 0x0948,
- 0x4624, 0x1612, 0x2236, 0x3a00, 0x0a48,
- 0x4724, 0x1712, 0x2336, 0x3b00, 0x0b48,
- 0x4726, 0x1714, 0x2338, 0x3b02, 0x0b4a,
- 0x4626, 0x1614, 0x2238, 0x3a02, 0x0a4a,
- 0x4526, 0x1514, 0x2138, 0x3902, 0x094a,
- 0x4528, 0x1516, 0x213a, 0x3904, 0x094c,
- 0x4628, 0x1616, 0x223a, 0x3a04, 0x0a4c,
- 0x4728, 0x1716, 0x233a, 0x3b04, 0x0b4c,
- 0x472a, 0x1718, 0x233c, 0x3b06, 0x0b4e,
- 0x462a, 0x1618, 0x223c, 0x3a06, 0x0a4e,
- 0x452a, 0x1518, 0x213c, 0x3906, 0x094e,
- 0x452c, 0x151a, 0x213e, 0x3908, 0x0950,
- 0x462c, 0x161a, 0x223e, 0x3a08, 0x0a50,
- 0x472c, 0x171a, 0x233e, 0x3b08, 0x0b50,
- 0x472e, 0x171c, 0x2340, 0x3b0a, 0x0b52,
- 0x462e, 0x161c, 0x2240, 0x3a0a, 0x0a52,
- 0x452e, 0x151c, 0x2140, 0x390a, 0x0952,
- 0x4530, 0x151e, 0x2142, 0x390c, 0x0954,
- 0x4630, 0x161e, 0x2242, 0x3a0c, 0x0a54,
- 0x4730, 0x171e, 0x2342, 0x3b0c, 0x0b54,
- 0x4732, 0x1720, 0x2344, 0x3b0e, 0x0b56,
- 0x4632, 0x1620, 0x2244, 0x3a0e, 0x0a56,
- 0x4532, 0x1520, 0x2144, 0x390e, 0x0956,
- 0x4534, 0x1522, 0x2146, 0x3910, 0x0958,
- 0x4634, 0x1622, 0x2246, 0x3a10, 0x0a58,
- 0x4734, 0x1722, 0x2346, 0x3b10, 0x0b58,
- 0x0324, 0x1b12, 0x2736, 0x3f00, 0x0f48,
- 0x0424, 0x1c12, 0x2836, 0x4000, 0x1048,
- 0x0524, 0x1d12, 0x2936, 0x4100, 0x1148,
- 0x0526, 0x1d14, 0x2938, 0x4102, 0x114a,
- 0x0426, 0x1c14, 0x2838, 0x4002, 0x104a,
- 0x0326, 0x1b14, 0x2738, 0x3f02, 0x0f4a,
- 0x0328, 0x1b16, 0x273a, 0x3f04, 0x0f4c,
- 0x0428, 0x1c16, 0x283a, 0x4004, 0x104c,
- 0x0528, 0x1d16, 0x293a, 0x4104, 0x114c,
- 0x052a, 0x1d18, 0x293c, 0x4106, 0x114e,
- 0x042a, 0x1c18, 0x283c, 0x4006, 0x104e,
- 0x032a, 0x1b18, 0x273c, 0x3f06, 0x0f4e,
- 0x032c, 0x1b1a, 0x273e, 0x3f08, 0x0f50,
- 0x042c, 0x1c1a, 0x283e, 0x4008, 0x1050,
- 0x052c, 0x1d1a, 0x293e, 0x4108, 0x1150,
- 0x052e, 0x1d1c, 0x2940, 0x410a, 0x1152,
- 0x042e, 0x1c1c, 0x2840, 0x400a, 0x1052,
- 0x032e, 0x1b1c, 0x2740, 0x3f0a, 0x0f52,
- 0x0330, 0x1b1e, 0x2742, 0x3f0c, 0x0f54,
- 0x0430, 0x1c1e, 0x2842, 0x400c, 0x1054,
- 0x0530, 0x1d1e, 0x2942, 0x410c, 0x1154,
- 0x0532, 0x1d20, 0x2944, 0x410e, 0x1156,
- 0x0432, 0x1c20, 0x2844, 0x400e, 0x1056,
- 0x0332, 0x1b20, 0x2744, 0x3f0e, 0x0f56,
- 0x0334, 0x1b22, 0x2746, 0x3f10, 0x0f58,
- 0x0434, 0x1c22, 0x2846, 0x4010, 0x1058,
- 0x0534, 0x1d22, 0x2946, 0x4110, 0x1158,
- 0x0924, 0x2112, 0x2d36, 0x4500, 0x1548,
- 0x0a24, 0x2212, 0x2e36, 0x4600, 0x1648,
- 0x0b24, 0x2312, 0x2f36, 0x4700, 0x1748,
- 0x0b26, 0x2314, 0x2f38, 0x4702, 0x174a,
- 0x0a26, 0x2214, 0x2e38, 0x4602, 0x164a,
- 0x0926, 0x2114, 0x2d38, 0x4502, 0x154a,
- 0x0928, 0x2116, 0x2d3a, 0x4504, 0x154c,
- 0x0a28, 0x2216, 0x2e3a, 0x4604, 0x164c,
- 0x0b28, 0x2316, 0x2f3a, 0x4704, 0x174c,
- 0x0b2a, 0x2318, 0x2f3c, 0x4706, 0x174e,
- 0x0a2a, 0x2218, 0x2e3c, 0x4606, 0x164e,
- 0x092a, 0x2118, 0x2d3c, 0x4506, 0x154e,
- 0x092c, 0x211a, 0x2d3e, 0x4508, 0x1550,
- 0x0a2c, 0x221a, 0x2e3e, 0x4608, 0x1650,
- 0x0b2c, 0x231a, 0x2f3e, 0x4708, 0x1750,
- 0x0b2e, 0x231c, 0x2f40, 0x470a, 0x1752,
- 0x0a2e, 0x221c, 0x2e40, 0x460a, 0x1652,
- 0x092e, 0x211c, 0x2d40, 0x450a, 0x1552,
- 0x0930, 0x211e, 0x2d42, 0x450c, 0x1554,
- 0x0a30, 0x221e, 0x2e42, 0x460c, 0x1654,
- 0x0b30, 0x231e, 0x2f42, 0x470c, 0x1754,
- 0x0b32, 0x2320, 0x2f44, 0x470e, 0x1756,
- 0x0a32, 0x2220, 0x2e44, 0x460e, 0x1656,
- 0x0932, 0x2120, 0x2d44, 0x450e, 0x1556,
- 0x0934, 0x2122, 0x2d46, 0x4510, 0x1558,
- 0x0a34, 0x2222, 0x2e46, 0x4610, 0x1658,
- 0x0b34, 0x2322, 0x2f46, 0x4710, 0x1758,
-};
-
-static const uint16_t dv_place_1080i60[4*10*27*5] = {
- 0x2048, 0x5024, 0x686c, 0x0800, 0x3890,
- 0x3848, 0x6824, 0x086c, 0x2000, 0x5090,
- 0x5048, 0x0824, 0x206c, 0x3800, 0x6890,
- 0x6848, 0x2024, 0x386c, 0x5000, 0x0890,
- 0x0848, 0x3824, 0x506c, 0x6800, 0x2090,
- 0x204a, 0x5026, 0x686e, 0x0802, 0x3892,
- 0x384a, 0x6826, 0x086e, 0x2002, 0x5092,
- 0x504a, 0x0826, 0x206e, 0x3802, 0x6892,
- 0x684a, 0x2026, 0x386e, 0x5002, 0x0892,
- 0x084a, 0x3826, 0x506e, 0x6802, 0x2092,
- 0x204c, 0x5028, 0x6870, 0x0804, 0x3894,
- 0x384c, 0x6828, 0x0870, 0x2004, 0x5094,
- 0x504c, 0x0828, 0x2070, 0x3804, 0x6894,
- 0x684c, 0x2028, 0x3870, 0x5004, 0x0894,
- 0x084c, 0x3828, 0x5070, 0x6804, 0x2094,
- 0x204e, 0x502a, 0x6872, 0x0806, 0x3896,
- 0x384e, 0x682a, 0x0872, 0x2006, 0x5096,
- 0x504e, 0x082a, 0x2072, 0x3806, 0x6896,
- 0x684e, 0x202a, 0x3872, 0x5006, 0x0896,
- 0x084e, 0x382a, 0x5072, 0x6806, 0x2096,
- 0x2050, 0x502c, 0x6874, 0x0808, 0x3898,
- 0x3850, 0x682c, 0x0874, 0x2008, 0x5098,
- 0x5050, 0x082c, 0x2074, 0x3808, 0x6898,
- 0x6850, 0x202c, 0x3874, 0x5008, 0x0898,
- 0x0850, 0x382c, 0x5074, 0x6808, 0x2098,
- 0x2052, 0x502e, 0x6876, 0x080a, 0x389a,
- 0x3852, 0x682e, 0x0876, 0x200a, 0x509a,
- 0x5052, 0x082e, 0x2076, 0x380a, 0x689a,
- 0x6852, 0x202e, 0x3876, 0x500a, 0x089a,
- 0x0852, 0x382e, 0x5076, 0x680a, 0x209a,
- 0x2054, 0x5030, 0x6878, 0x080c, 0x389c,
- 0x3854, 0x6830, 0x0878, 0x200c, 0x509c,
- 0x5054, 0x0830, 0x2078, 0x380c, 0x689c,
- 0x6854, 0x2030, 0x3878, 0x500c, 0x089c,
- 0x0854, 0x3830, 0x5078, 0x680c, 0x209c,
- 0x2056, 0x5032, 0x687a, 0x080e, 0x389e,
- 0x3856, 0x6832, 0x087a, 0x200e, 0x509e,
- 0x5056, 0x0832, 0x207a, 0x380e, 0x689e,
- 0x6856, 0x2032, 0x387a, 0x500e, 0x089e,
- 0x0856, 0x3832, 0x507a, 0x680e, 0x209e,
- 0x2058, 0x5034, 0x687c, 0x0810, 0x0078,
- 0x3858, 0x6834, 0x087c, 0x2010, 0x8214,
- 0x5058, 0x0834, 0x207c, 0x3810, 0x8264,
- 0x6858, 0x2034, 0x387c, 0x5010, 0x0000,
- 0x0858, 0x3834, 0x507c, 0x6810, 0x003c,
- 0x2448, 0x5424, 0x6c6c, 0x0c00, 0x3c90,
- 0x3c48, 0x6c24, 0x0c6c, 0x2400, 0x5490,
- 0x5448, 0x0c24, 0x246c, 0x3c00, 0x6c90,
- 0x6c48, 0x2424, 0x3c6c, 0x5400, 0x0c90,
- 0x0c48, 0x3c24, 0x546c, 0x6c00, 0x2490,
- 0x244a, 0x5426, 0x6c6e, 0x0c02, 0x3c92,
- 0x3c4a, 0x6c26, 0x0c6e, 0x2402, 0x5492,
- 0x544a, 0x0c26, 0x246e, 0x3c02, 0x6c92,
- 0x6c4a, 0x2426, 0x3c6e, 0x5402, 0x0c92,
- 0x0c4a, 0x3c26, 0x546e, 0x6c02, 0x2492,
- 0x244c, 0x5428, 0x6c70, 0x0c04, 0x3c94,
- 0x3c4c, 0x6c28, 0x0c70, 0x2404, 0x5494,
- 0x544c, 0x0c28, 0x2470, 0x3c04, 0x6c94,
- 0x6c4c, 0x2428, 0x3c70, 0x5404, 0x0c94,
- 0x0c4c, 0x3c28, 0x5470, 0x6c04, 0x2494,
- 0x244e, 0x542a, 0x6c72, 0x0c06, 0x3c96,
- 0x3c4e, 0x6c2a, 0x0c72, 0x2406, 0x5496,
- 0x544e, 0x0c2a, 0x2472, 0x3c06, 0x6c96,
- 0x6c4e, 0x242a, 0x3c72, 0x5406, 0x0c96,
- 0x0c4e, 0x3c2a, 0x5472, 0x6c06, 0x2496,
- 0x2450, 0x542c, 0x6c74, 0x0c08, 0x3c98,
- 0x3c50, 0x6c2c, 0x0c74, 0x2408, 0x5498,
- 0x5450, 0x0c2c, 0x2474, 0x3c08, 0x6c98,
- 0x6c50, 0x242c, 0x3c74, 0x5408, 0x0c98,
- 0x0c50, 0x3c2c, 0x5474, 0x6c08, 0x2498,
- 0x2452, 0x542e, 0x6c76, 0x0c0a, 0x3c9a,
- 0x3c52, 0x6c2e, 0x0c76, 0x240a, 0x549a,
- 0x5452, 0x0c2e, 0x2476, 0x3c0a, 0x6c9a,
- 0x6c52, 0x242e, 0x3c76, 0x540a, 0x0c9a,
- 0x0c52, 0x3c2e, 0x5476, 0x6c0a, 0x249a,
- 0x2454, 0x5430, 0x6c78, 0x0c0c, 0x3c9c,
- 0x3c54, 0x6c30, 0x0c78, 0x240c, 0x549c,
- 0x5454, 0x0c30, 0x2478, 0x3c0c, 0x6c9c,
- 0x6c54, 0x2430, 0x3c78, 0x540c, 0x0c9c,
- 0x0c54, 0x3c30, 0x5478, 0x6c0c, 0x249c,
- 0x2456, 0x5432, 0x6c7a, 0x0c0e, 0x3c9e,
- 0x3c56, 0x6c32, 0x0c7a, 0x240e, 0x549e,
- 0x5456, 0x0c32, 0x247a, 0x3c0e, 0x6c9e,
- 0x6c56, 0x2432, 0x3c7a, 0x540e, 0x0c9e,
- 0x0c56, 0x3c32, 0x547a, 0x6c0e, 0x249e,
- 0x2458, 0x5434, 0x6c7c, 0x0c10, 0x0478,
- 0x3c58, 0x6c34, 0x0c7c, 0x2410, 0x8028,
- 0x5458, 0x0c34, 0x247c, 0x3c10, 0x8078,
- 0x6c58, 0x2434, 0x3c7c, 0x5410, 0x0400,
- 0x0c58, 0x3c34, 0x547c, 0x6c10, 0x043c,
- 0x2848, 0x5824, 0x706c, 0x1000, 0x4090,
- 0x4048, 0x7024, 0x106c, 0x2800, 0x5890,
- 0x5848, 0x1024, 0x286c, 0x4000, 0x7090,
- 0x7048, 0x2824, 0x406c, 0x5800, 0x1090,
- 0x1048, 0x4024, 0x586c, 0x7000, 0x2890,
- 0x284a, 0x5826, 0x706e, 0x1002, 0x4092,
- 0x404a, 0x7026, 0x106e, 0x2802, 0x5892,
- 0x584a, 0x1026, 0x286e, 0x4002, 0x7092,
- 0x704a, 0x2826, 0x406e, 0x5802, 0x1092,
- 0x104a, 0x4026, 0x586e, 0x7002, 0x2892,
- 0x284c, 0x5828, 0x7070, 0x1004, 0x4094,
- 0x404c, 0x7028, 0x1070, 0x2804, 0x5894,
- 0x584c, 0x1028, 0x2870, 0x4004, 0x7094,
- 0x704c, 0x2828, 0x4070, 0x5804, 0x1094,
- 0x104c, 0x4028, 0x5870, 0x7004, 0x2894,
- 0x284e, 0x582a, 0x7072, 0x1006, 0x4096,
- 0x404e, 0x702a, 0x1072, 0x2806, 0x5896,
- 0x584e, 0x102a, 0x2872, 0x4006, 0x7096,
- 0x704e, 0x282a, 0x4072, 0x5806, 0x1096,
- 0x104e, 0x402a, 0x5872, 0x7006, 0x2896,
- 0x2850, 0x582c, 0x7074, 0x1008, 0x4098,
- 0x4050, 0x702c, 0x1074, 0x2808, 0x5898,
- 0x5850, 0x102c, 0x2874, 0x4008, 0x7098,
- 0x7050, 0x282c, 0x4074, 0x5808, 0x1098,
- 0x1050, 0x402c, 0x5874, 0x7008, 0x2898,
- 0x2852, 0x582e, 0x7076, 0x100a, 0x409a,
- 0x4052, 0x702e, 0x1076, 0x280a, 0x589a,
- 0x5852, 0x102e, 0x2876, 0x400a, 0x709a,
- 0x7052, 0x282e, 0x4076, 0x580a, 0x109a,
- 0x1052, 0x402e, 0x5876, 0x700a, 0x289a,
- 0x2854, 0x5830, 0x7078, 0x100c, 0x409c,
- 0x4054, 0x7030, 0x1078, 0x280c, 0x589c,
- 0x5854, 0x1030, 0x2878, 0x400c, 0x709c,
- 0x7054, 0x2830, 0x4078, 0x580c, 0x109c,
- 0x1054, 0x4030, 0x5878, 0x700c, 0x289c,
- 0x2856, 0x5832, 0x707a, 0x100e, 0x409e,
- 0x4056, 0x7032, 0x107a, 0x280e, 0x589e,
- 0x5856, 0x1032, 0x287a, 0x400e, 0x709e,
- 0x7056, 0x2832, 0x407a, 0x580e, 0x109e,
- 0x1056, 0x4032, 0x587a, 0x700e, 0x289e,
- 0x2858, 0x5834, 0x707c, 0x1010, 0x008c,
- 0x4058, 0x7034, 0x107c, 0x2810, 0x8428,
- 0x5858, 0x1034, 0x287c, 0x4010, 0x8478,
- 0x7058, 0x2834, 0x407c, 0x5810, 0x0014,
- 0x1058, 0x4034, 0x587c, 0x7010, 0x0050,
- 0x2c48, 0x5c24, 0x746c, 0x1400, 0x4490,
- 0x4448, 0x7424, 0x146c, 0x2c00, 0x5c90,
- 0x5c48, 0x1424, 0x2c6c, 0x4400, 0x7490,
- 0x7448, 0x2c24, 0x446c, 0x5c00, 0x1490,
- 0x1448, 0x4424, 0x5c6c, 0x7400, 0x2c90,
- 0x2c4a, 0x5c26, 0x746e, 0x1402, 0x4492,
- 0x444a, 0x7426, 0x146e, 0x2c02, 0x5c92,
- 0x5c4a, 0x1426, 0x2c6e, 0x4402, 0x7492,
- 0x744a, 0x2c26, 0x446e, 0x5c02, 0x1492,
- 0x144a, 0x4426, 0x5c6e, 0x7402, 0x2c92,
- 0x2c4c, 0x5c28, 0x7470, 0x1404, 0x4494,
- 0x444c, 0x7428, 0x1470, 0x2c04, 0x5c94,
- 0x5c4c, 0x1428, 0x2c70, 0x4404, 0x7494,
- 0x744c, 0x2c28, 0x4470, 0x5c04, 0x1494,
- 0x144c, 0x4428, 0x5c70, 0x7404, 0x2c94,
- 0x2c4e, 0x5c2a, 0x7472, 0x1406, 0x4496,
- 0x444e, 0x742a, 0x1472, 0x2c06, 0x5c96,
- 0x5c4e, 0x142a, 0x2c72, 0x4406, 0x7496,
- 0x744e, 0x2c2a, 0x4472, 0x5c06, 0x1496,
- 0x144e, 0x442a, 0x5c72, 0x7406, 0x2c96,
- 0x2c50, 0x5c2c, 0x7474, 0x1408, 0x4498,
- 0x4450, 0x742c, 0x1474, 0x2c08, 0x5c98,
- 0x5c50, 0x142c, 0x2c74, 0x4408, 0x7498,
- 0x7450, 0x2c2c, 0x4474, 0x5c08, 0x1498,
- 0x1450, 0x442c, 0x5c74, 0x7408, 0x2c98,
- 0x2c52, 0x5c2e, 0x7476, 0x140a, 0x449a,
- 0x4452, 0x742e, 0x1476, 0x2c0a, 0x5c9a,
- 0x5c52, 0x142e, 0x2c76, 0x440a, 0x749a,
- 0x7452, 0x2c2e, 0x4476, 0x5c0a, 0x149a,
- 0x1452, 0x442e, 0x5c76, 0x740a, 0x2c9a,
- 0x2c54, 0x5c30, 0x7478, 0x140c, 0x449c,
- 0x4454, 0x7430, 0x1478, 0x2c0c, 0x5c9c,
- 0x5c54, 0x1430, 0x2c78, 0x440c, 0x749c,
- 0x7454, 0x2c30, 0x4478, 0x5c0c, 0x149c,
- 0x1454, 0x4430, 0x5c78, 0x740c, 0x2c9c,
- 0x2c56, 0x5c32, 0x747a, 0x140e, 0x449e,
- 0x4456, 0x7432, 0x147a, 0x2c0e, 0x5c9e,
- 0x5c56, 0x1432, 0x2c7a, 0x440e, 0x749e,
- 0x7456, 0x2c32, 0x447a, 0x5c0e, 0x149e,
- 0x1456, 0x4432, 0x5c7a, 0x740e, 0x2c9e,
- 0x2c58, 0x5c34, 0x747c, 0x1410, 0x048c,
- 0x4458, 0x7434, 0x147c, 0x2c10, 0x823c,
- 0x5c58, 0x1434, 0x2c7c, 0x4410, 0x828c,
- 0x7458, 0x2c34, 0x447c, 0x5c10, 0x0414,
- 0x1458, 0x4434, 0x5c7c, 0x7410, 0x0450,
- 0x3048, 0x6024, 0x786c, 0x1800, 0x4890,
- 0x4848, 0x7824, 0x186c, 0x3000, 0x6090,
- 0x6048, 0x1824, 0x306c, 0x4800, 0x7890,
- 0x7848, 0x3024, 0x486c, 0x6000, 0x1890,
- 0x1848, 0x4824, 0x606c, 0x7800, 0x3090,
- 0x304a, 0x6026, 0x786e, 0x1802, 0x4892,
- 0x484a, 0x7826, 0x186e, 0x3002, 0x6092,
- 0x604a, 0x1826, 0x306e, 0x4802, 0x7892,
- 0x784a, 0x3026, 0x486e, 0x6002, 0x1892,
- 0x184a, 0x4826, 0x606e, 0x7802, 0x3092,
- 0x304c, 0x6028, 0x7870, 0x1804, 0x4894,
- 0x484c, 0x7828, 0x1870, 0x3004, 0x6094,
- 0x604c, 0x1828, 0x3070, 0x4804, 0x7894,
- 0x784c, 0x3028, 0x4870, 0x6004, 0x1894,
- 0x184c, 0x4828, 0x6070, 0x7804, 0x3094,
- 0x304e, 0x602a, 0x7872, 0x1806, 0x4896,
- 0x484e, 0x782a, 0x1872, 0x3006, 0x6096,
- 0x604e, 0x182a, 0x3072, 0x4806, 0x7896,
- 0x784e, 0x302a, 0x4872, 0x6006, 0x1896,
- 0x184e, 0x482a, 0x6072, 0x7806, 0x3096,
- 0x3050, 0x602c, 0x7874, 0x1808, 0x4898,
- 0x4850, 0x782c, 0x1874, 0x3008, 0x6098,
- 0x6050, 0x182c, 0x3074, 0x4808, 0x7898,
- 0x7850, 0x302c, 0x4874, 0x6008, 0x1898,
- 0x1850, 0x482c, 0x6074, 0x7808, 0x3098,
- 0x3052, 0x602e, 0x7876, 0x180a, 0x489a,
- 0x4852, 0x782e, 0x1876, 0x300a, 0x609a,
- 0x6052, 0x182e, 0x3076, 0x480a, 0x789a,
- 0x7852, 0x302e, 0x4876, 0x600a, 0x189a,
- 0x1852, 0x482e, 0x6076, 0x780a, 0x309a,
- 0x3054, 0x6030, 0x7878, 0x180c, 0x489c,
- 0x4854, 0x7830, 0x1878, 0x300c, 0x609c,
- 0x6054, 0x1830, 0x3078, 0x480c, 0x789c,
- 0x7854, 0x3030, 0x4878, 0x600c, 0x189c,
- 0x1854, 0x4830, 0x6078, 0x780c, 0x309c,
- 0x3056, 0x6032, 0x787a, 0x180e, 0x489e,
- 0x4856, 0x7832, 0x187a, 0x300e, 0x609e,
- 0x6056, 0x1832, 0x307a, 0x480e, 0x789e,
- 0x7856, 0x3032, 0x487a, 0x600e, 0x189e,
- 0x1856, 0x4832, 0x607a, 0x780e, 0x309e,
- 0x3058, 0x6034, 0x787c, 0x1810, 0x8000,
- 0x4858, 0x7834, 0x187c, 0x3010, 0x8050,
- 0x6058, 0x1834, 0x307c, 0x4810, 0x8600,
- 0x7858, 0x3034, 0x487c, 0x6010, 0x0028,
- 0x1858, 0x4834, 0x607c, 0x7810, 0x0064,
- 0x3448, 0x6424, 0x7c6c, 0x1c00, 0x4c90,
- 0x4c48, 0x7c24, 0x1c6c, 0x3400, 0x6490,
- 0x6448, 0x1c24, 0x346c, 0x4c00, 0x7c90,
- 0x7c48, 0x3424, 0x4c6c, 0x6400, 0x1c90,
- 0x1c48, 0x4c24, 0x646c, 0x7c00, 0x3490,
- 0x344a, 0x6426, 0x7c6e, 0x1c02, 0x4c92,
- 0x4c4a, 0x7c26, 0x1c6e, 0x3402, 0x6492,
- 0x644a, 0x1c26, 0x346e, 0x4c02, 0x7c92,
- 0x7c4a, 0x3426, 0x4c6e, 0x6402, 0x1c92,
- 0x1c4a, 0x4c26, 0x646e, 0x7c02, 0x3492,
- 0x344c, 0x6428, 0x7c70, 0x1c04, 0x4c94,
- 0x4c4c, 0x7c28, 0x1c70, 0x3404, 0x6494,
- 0x644c, 0x1c28, 0x3470, 0x4c04, 0x7c94,
- 0x7c4c, 0x3428, 0x4c70, 0x6404, 0x1c94,
- 0x1c4c, 0x4c28, 0x6470, 0x7c04, 0x3494,
- 0x344e, 0x642a, 0x7c72, 0x1c06, 0x4c96,
- 0x4c4e, 0x7c2a, 0x1c72, 0x3406, 0x6496,
- 0x644e, 0x1c2a, 0x3472, 0x4c06, 0x7c96,
- 0x7c4e, 0x342a, 0x4c72, 0x6406, 0x1c96,
- 0x1c4e, 0x4c2a, 0x6472, 0x7c06, 0x3496,
- 0x3450, 0x642c, 0x7c74, 0x1c08, 0x4c98,
- 0x4c50, 0x7c2c, 0x1c74, 0x3408, 0x6498,
- 0x6450, 0x1c2c, 0x3474, 0x4c08, 0x7c98,
- 0x7c50, 0x342c, 0x4c74, 0x6408, 0x1c98,
- 0x1c50, 0x4c2c, 0x6474, 0x7c08, 0x3498,
- 0x3452, 0x642e, 0x7c76, 0x1c0a, 0x4c9a,
- 0x4c52, 0x7c2e, 0x1c76, 0x340a, 0x649a,
- 0x6452, 0x1c2e, 0x3476, 0x4c0a, 0x7c9a,
- 0x7c52, 0x342e, 0x4c76, 0x640a, 0x1c9a,
- 0x1c52, 0x4c2e, 0x6476, 0x7c0a, 0x349a,
- 0x3454, 0x6430, 0x7c78, 0x1c0c, 0x4c9c,
- 0x4c54, 0x7c30, 0x1c78, 0x340c, 0x649c,
- 0x6454, 0x1c30, 0x3478, 0x4c0c, 0x7c9c,
- 0x7c54, 0x3430, 0x4c78, 0x640c, 0x1c9c,
- 0x1c54, 0x4c30, 0x6478, 0x7c0c, 0x349c,
- 0x3456, 0x6432, 0x7c7a, 0x1c0e, 0x4c9e,
- 0x4c56, 0x7c32, 0x1c7a, 0x340e, 0x649e,
- 0x6456, 0x1c32, 0x347a, 0x4c0e, 0x7c9e,
- 0x7c56, 0x3432, 0x4c7a, 0x640e, 0x1c9e,
- 0x1c56, 0x4c32, 0x647a, 0x7c0e, 0x349e,
- 0x3458, 0x6434, 0x7c7c, 0x1c10, 0x8400,
- 0x4c58, 0x7c34, 0x1c7c, 0x3410, 0x8450,
- 0x6458, 0x1c34, 0x347c, 0x4c10, 0x8650,
- 0x7c58, 0x3434, 0x4c7c, 0x6410, 0x0428,
- 0x1c58, 0x4c34, 0x647c, 0x7c10, 0x0464,
- 0x505a, 0x0836, 0x207e, 0x3812, 0x8266,
- 0x685a, 0x2036, 0x387e, 0x5012, 0x0002,
- 0x085a, 0x3836, 0x507e, 0x6812, 0x003e,
- 0x205a, 0x5036, 0x687e, 0x0812, 0x007a,
- 0x385a, 0x6836, 0x087e, 0x2012, 0x8216,
- 0x505c, 0x0838, 0x2080, 0x3814, 0x8268,
- 0x685c, 0x2038, 0x3880, 0x5014, 0x0004,
- 0x085c, 0x3838, 0x5080, 0x6814, 0x0040,
- 0x205c, 0x5038, 0x6880, 0x0814, 0x007c,
- 0x385c, 0x6838, 0x0880, 0x2014, 0x8218,
- 0x505e, 0x083a, 0x2082, 0x3816, 0x826a,
- 0x685e, 0x203a, 0x3882, 0x5016, 0x0006,
- 0x085e, 0x383a, 0x5082, 0x6816, 0x0042,
- 0x205e, 0x503a, 0x6882, 0x0816, 0x007e,
- 0x385e, 0x683a, 0x0882, 0x2016, 0x821a,
- 0x5060, 0x083c, 0x2084, 0x3818, 0x826c,
- 0x6860, 0x203c, 0x3884, 0x5018, 0x0008,
- 0x0860, 0x383c, 0x5084, 0x6818, 0x0044,
- 0x2060, 0x503c, 0x6884, 0x0818, 0x0080,
- 0x3860, 0x683c, 0x0884, 0x2018, 0x821c,
- 0x5062, 0x083e, 0x2086, 0x381a, 0x826e,
- 0x6862, 0x203e, 0x3886, 0x501a, 0x000a,
- 0x0862, 0x383e, 0x5086, 0x681a, 0x0046,
- 0x2062, 0x503e, 0x6886, 0x081a, 0x0082,
- 0x3862, 0x683e, 0x0886, 0x201a, 0x821e,
- 0x5064, 0x0840, 0x2088, 0x381c, 0x8270,
- 0x6864, 0x2040, 0x3888, 0x501c, 0x000c,
- 0x0864, 0x3840, 0x5088, 0x681c, 0x0048,
- 0x2064, 0x5040, 0x6888, 0x081c, 0x0084,
- 0x3864, 0x6840, 0x0888, 0x201c, 0x8220,
- 0x5066, 0x0842, 0x208a, 0x381e, 0x8272,
- 0x6866, 0x2042, 0x388a, 0x501e, 0x000e,
- 0x0866, 0x3842, 0x508a, 0x681e, 0x004a,
- 0x2066, 0x5042, 0x688a, 0x081e, 0x0086,
- 0x3866, 0x6842, 0x088a, 0x201e, 0x8222,
- 0x5068, 0x0844, 0x208c, 0x3820, 0x8274,
- 0x6868, 0x2044, 0x388c, 0x5020, 0x0010,
- 0x0868, 0x3844, 0x508c, 0x6820, 0x004c,
- 0x2068, 0x5044, 0x688c, 0x0820, 0x0088,
- 0x3868, 0x6844, 0x088c, 0x2020, 0x8224,
- 0x506a, 0x0846, 0x208e, 0x3822, 0x8276,
- 0x686a, 0x2046, 0x388e, 0x5022, 0x0012,
- 0x086a, 0x3846, 0x508e, 0x6822, 0x004e,
- 0x206a, 0x5046, 0x688e, 0x0822, 0x008a,
- 0x386a, 0x6846, 0x088e, 0x2022, 0x8226,
- 0x545a, 0x0c36, 0x247e, 0x3c12, 0x807a,
- 0x6c5a, 0x2436, 0x3c7e, 0x5412, 0x0402,
- 0x0c5a, 0x3c36, 0x547e, 0x6c12, 0x043e,
- 0x245a, 0x5436, 0x6c7e, 0x0c12, 0x047a,
- 0x3c5a, 0x6c36, 0x0c7e, 0x2412, 0x802a,
- 0x545c, 0x0c38, 0x2480, 0x3c14, 0x807c,
- 0x6c5c, 0x2438, 0x3c80, 0x5414, 0x0404,
- 0x0c5c, 0x3c38, 0x5480, 0x6c14, 0x0440,
- 0x245c, 0x5438, 0x6c80, 0x0c14, 0x047c,
- 0x3c5c, 0x6c38, 0x0c80, 0x2414, 0x802c,
- 0x545e, 0x0c3a, 0x2482, 0x3c16, 0x807e,
- 0x6c5e, 0x243a, 0x3c82, 0x5416, 0x0406,
- 0x0c5e, 0x3c3a, 0x5482, 0x6c16, 0x0442,
- 0x245e, 0x543a, 0x6c82, 0x0c16, 0x047e,
- 0x3c5e, 0x6c3a, 0x0c82, 0x2416, 0x802e,
- 0x5460, 0x0c3c, 0x2484, 0x3c18, 0x8080,
- 0x6c60, 0x243c, 0x3c84, 0x5418, 0x0408,
- 0x0c60, 0x3c3c, 0x5484, 0x6c18, 0x0444,
- 0x2460, 0x543c, 0x6c84, 0x0c18, 0x0480,
- 0x3c60, 0x6c3c, 0x0c84, 0x2418, 0x8030,
- 0x5462, 0x0c3e, 0x2486, 0x3c1a, 0x8082,
- 0x6c62, 0x243e, 0x3c86, 0x541a, 0x040a,
- 0x0c62, 0x3c3e, 0x5486, 0x6c1a, 0x0446,
- 0x2462, 0x543e, 0x6c86, 0x0c1a, 0x0482,
- 0x3c62, 0x6c3e, 0x0c86, 0x241a, 0x8032,
- 0x5464, 0x0c40, 0x2488, 0x3c1c, 0x8084,
- 0x6c64, 0x2440, 0x3c88, 0x541c, 0x040c,
- 0x0c64, 0x3c40, 0x5488, 0x6c1c, 0x0448,
- 0x2464, 0x5440, 0x6c88, 0x0c1c, 0x0484,
- 0x3c64, 0x6c40, 0x0c88, 0x241c, 0x8034,
- 0x5466, 0x0c42, 0x248a, 0x3c1e, 0x8086,
- 0x6c66, 0x2442, 0x3c8a, 0x541e, 0x040e,
- 0x0c66, 0x3c42, 0x548a, 0x6c1e, 0x044a,
- 0x2466, 0x5442, 0x6c8a, 0x0c1e, 0x0486,
- 0x3c66, 0x6c42, 0x0c8a, 0x241e, 0x8036,
- 0x5468, 0x0c44, 0x248c, 0x3c20, 0x8088,
- 0x6c68, 0x2444, 0x3c8c, 0x5420, 0x0410,
- 0x0c68, 0x3c44, 0x548c, 0x6c20, 0x044c,
- 0x2468, 0x5444, 0x6c8c, 0x0c20, 0x0488,
- 0x3c68, 0x6c44, 0x0c8c, 0x2420, 0x8038,
- 0x546a, 0x0c46, 0x248e, 0x3c22, 0x808a,
- 0x6c6a, 0x2446, 0x3c8e, 0x5422, 0x0412,
- 0x0c6a, 0x3c46, 0x548e, 0x6c22, 0x044e,
- 0x246a, 0x5446, 0x6c8e, 0x0c22, 0x048a,
- 0x3c6a, 0x6c46, 0x0c8e, 0x2422, 0x803a,
- 0x585a, 0x1036, 0x287e, 0x4012, 0x847a,
- 0x705a, 0x2836, 0x407e, 0x5812, 0x0016,
- 0x105a, 0x4036, 0x587e, 0x7012, 0x0052,
- 0x285a, 0x5836, 0x707e, 0x1012, 0x008e,
- 0x405a, 0x7036, 0x107e, 0x2812, 0x842a,
- 0x585c, 0x1038, 0x2880, 0x4014, 0x847c,
- 0x705c, 0x2838, 0x4080, 0x5814, 0x0018,
- 0x105c, 0x4038, 0x5880, 0x7014, 0x0054,
- 0x285c, 0x5838, 0x7080, 0x1014, 0x0090,
- 0x405c, 0x7038, 0x1080, 0x2814, 0x842c,
- 0x585e, 0x103a, 0x2882, 0x4016, 0x847e,
- 0x705e, 0x283a, 0x4082, 0x5816, 0x001a,
- 0x105e, 0x403a, 0x5882, 0x7016, 0x0056,
- 0x285e, 0x583a, 0x7082, 0x1016, 0x0092,
- 0x405e, 0x703a, 0x1082, 0x2816, 0x842e,
- 0x5860, 0x103c, 0x2884, 0x4018, 0x8480,
- 0x7060, 0x283c, 0x4084, 0x5818, 0x001c,
- 0x1060, 0x403c, 0x5884, 0x7018, 0x0058,
- 0x2860, 0x583c, 0x7084, 0x1018, 0x0094,
- 0x4060, 0x703c, 0x1084, 0x2818, 0x8430,
- 0x5862, 0x103e, 0x2886, 0x401a, 0x8482,
- 0x7062, 0x283e, 0x4086, 0x581a, 0x001e,
- 0x1062, 0x403e, 0x5886, 0x701a, 0x005a,
- 0x2862, 0x583e, 0x7086, 0x101a, 0x0096,
- 0x4062, 0x703e, 0x1086, 0x281a, 0x8432,
- 0x5864, 0x1040, 0x2888, 0x401c, 0x8484,
- 0x7064, 0x2840, 0x4088, 0x581c, 0x0020,
- 0x1064, 0x4040, 0x5888, 0x701c, 0x005c,
- 0x2864, 0x5840, 0x7088, 0x101c, 0x0098,
- 0x4064, 0x7040, 0x1088, 0x281c, 0x8434,
- 0x5866, 0x1042, 0x288a, 0x401e, 0x8486,
- 0x7066, 0x2842, 0x408a, 0x581e, 0x0022,
- 0x1066, 0x4042, 0x588a, 0x701e, 0x005e,
- 0x2866, 0x5842, 0x708a, 0x101e, 0x009a,
- 0x4066, 0x7042, 0x108a, 0x281e, 0x8436,
- 0x5868, 0x1044, 0x288c, 0x4020, 0x8488,
- 0x7068, 0x2844, 0x408c, 0x5820, 0x0024,
- 0x1068, 0x4044, 0x588c, 0x7020, 0x0060,
- 0x2868, 0x5844, 0x708c, 0x1020, 0x009c,
- 0x4068, 0x7044, 0x108c, 0x2820, 0x8438,
- 0x586a, 0x1046, 0x288e, 0x4022, 0x848a,
- 0x706a, 0x2846, 0x408e, 0x5822, 0x0026,
- 0x106a, 0x4046, 0x588e, 0x7022, 0x0062,
- 0x286a, 0x5846, 0x708e, 0x1022, 0x009e,
- 0x406a, 0x7046, 0x108e, 0x2822, 0x843a,
- 0x5c5a, 0x1436, 0x2c7e, 0x4412, 0x828e,
- 0x745a, 0x2c36, 0x447e, 0x5c12, 0x0416,
- 0x145a, 0x4436, 0x5c7e, 0x7412, 0x0452,
- 0x2c5a, 0x5c36, 0x747e, 0x1412, 0x048e,
- 0x445a, 0x7436, 0x147e, 0x2c12, 0x823e,
- 0x5c5c, 0x1438, 0x2c80, 0x4414, 0x8290,
- 0x745c, 0x2c38, 0x4480, 0x5c14, 0x0418,
- 0x145c, 0x4438, 0x5c80, 0x7414, 0x0454,
- 0x2c5c, 0x5c38, 0x7480, 0x1414, 0x0490,
- 0x445c, 0x7438, 0x1480, 0x2c14, 0x8240,
- 0x5c5e, 0x143a, 0x2c82, 0x4416, 0x8292,
- 0x745e, 0x2c3a, 0x4482, 0x5c16, 0x041a,
- 0x145e, 0x443a, 0x5c82, 0x7416, 0x0456,
- 0x2c5e, 0x5c3a, 0x7482, 0x1416, 0x0492,
- 0x445e, 0x743a, 0x1482, 0x2c16, 0x8242,
- 0x5c60, 0x143c, 0x2c84, 0x4418, 0x8294,
- 0x7460, 0x2c3c, 0x4484, 0x5c18, 0x041c,
- 0x1460, 0x443c, 0x5c84, 0x7418, 0x0458,
- 0x2c60, 0x5c3c, 0x7484, 0x1418, 0x0494,
- 0x4460, 0x743c, 0x1484, 0x2c18, 0x8244,
- 0x5c62, 0x143e, 0x2c86, 0x441a, 0x8296,
- 0x7462, 0x2c3e, 0x4486, 0x5c1a, 0x041e,
- 0x1462, 0x443e, 0x5c86, 0x741a, 0x045a,
- 0x2c62, 0x5c3e, 0x7486, 0x141a, 0x0496,
- 0x4462, 0x743e, 0x1486, 0x2c1a, 0x8246,
- 0x5c64, 0x1440, 0x2c88, 0x441c, 0x8298,
- 0x7464, 0x2c40, 0x4488, 0x5c1c, 0x0420,
- 0x1464, 0x4440, 0x5c88, 0x741c, 0x045c,
- 0x2c64, 0x5c40, 0x7488, 0x141c, 0x0498,
- 0x4464, 0x7440, 0x1488, 0x2c1c, 0x8248,
- 0x5c66, 0x1442, 0x2c8a, 0x441e, 0x829a,
- 0x7466, 0x2c42, 0x448a, 0x5c1e, 0x0422,
- 0x1466, 0x4442, 0x5c8a, 0x741e, 0x045e,
- 0x2c66, 0x5c42, 0x748a, 0x141e, 0x049a,
- 0x4466, 0x7442, 0x148a, 0x2c1e, 0x824a,
- 0x5c68, 0x1444, 0x2c8c, 0x4420, 0x829c,
- 0x7468, 0x2c44, 0x448c, 0x5c20, 0x0424,
- 0x1468, 0x4444, 0x5c8c, 0x7420, 0x0460,
- 0x2c68, 0x5c44, 0x748c, 0x1420, 0x049c,
- 0x4468, 0x7444, 0x148c, 0x2c20, 0x824c,
- 0x5c6a, 0x1446, 0x2c8e, 0x4422, 0x829e,
- 0x746a, 0x2c46, 0x448e, 0x5c22, 0x0426,
- 0x146a, 0x4446, 0x5c8e, 0x7422, 0x0462,
- 0x2c6a, 0x5c46, 0x748e, 0x1422, 0x049e,
- 0x446a, 0x7446, 0x148e, 0x2c22, 0x824e,
- 0x605a, 0x1836, 0x307e, 0x4812, 0x8604,
- 0x785a, 0x3036, 0x487e, 0x6012, 0x002a,
- 0x185a, 0x4836, 0x607e, 0x7812, 0x0066,
- 0x305a, 0x6036, 0x787e, 0x1812, 0x8002,
- 0x485a, 0x7836, 0x187e, 0x3012, 0x8052,
- 0x605c, 0x1838, 0x3080, 0x4814, 0x8608,
- 0x785c, 0x3038, 0x4880, 0x6014, 0x002c,
- 0x185c, 0x4838, 0x6080, 0x7814, 0x0068,
- 0x305c, 0x6038, 0x7880, 0x1814, 0x8004,
- 0x485c, 0x7838, 0x1880, 0x3014, 0x8054,
- 0x605e, 0x183a, 0x3082, 0x4816, 0x860c,
- 0x785e, 0x303a, 0x4882, 0x6016, 0x002e,
- 0x185e, 0x483a, 0x6082, 0x7816, 0x006a,
- 0x305e, 0x603a, 0x7882, 0x1816, 0x8006,
- 0x485e, 0x783a, 0x1882, 0x3016, 0x8056,
- 0x6060, 0x183c, 0x3084, 0x4818, 0x8610,
- 0x7860, 0x303c, 0x4884, 0x6018, 0x0030,
- 0x1860, 0x483c, 0x6084, 0x7818, 0x006c,
- 0x3060, 0x603c, 0x7884, 0x1818, 0x8008,
- 0x4860, 0x783c, 0x1884, 0x3018, 0x8058,
- 0x6062, 0x183e, 0x3086, 0x481a, 0x8614,
- 0x7862, 0x303e, 0x4886, 0x601a, 0x0032,
- 0x1862, 0x483e, 0x6086, 0x781a, 0x006e,
- 0x3062, 0x603e, 0x7886, 0x181a, 0x800a,
- 0x4862, 0x783e, 0x1886, 0x301a, 0x805a,
- 0x6064, 0x1840, 0x3088, 0x481c, 0x8618,
- 0x7864, 0x3040, 0x4888, 0x601c, 0x0034,
- 0x1864, 0x4840, 0x6088, 0x781c, 0x0070,
- 0x3064, 0x6040, 0x7888, 0x181c, 0x800c,
- 0x4864, 0x7840, 0x1888, 0x301c, 0x805c,
- 0x6066, 0x1842, 0x308a, 0x481e, 0x861c,
- 0x7866, 0x3042, 0x488a, 0x601e, 0x0036,
- 0x1866, 0x4842, 0x608a, 0x781e, 0x0072,
- 0x3066, 0x6042, 0x788a, 0x181e, 0x800e,
- 0x4866, 0x7842, 0x188a, 0x301e, 0x805e,
- 0x6068, 0x1844, 0x308c, 0x4820, 0x8620,
- 0x7868, 0x3044, 0x488c, 0x6020, 0x0038,
- 0x1868, 0x4844, 0x608c, 0x7820, 0x0074,
- 0x3068, 0x6044, 0x788c, 0x1820, 0x8010,
- 0x4868, 0x7844, 0x188c, 0x3020, 0x8060,
- 0x606a, 0x1846, 0x308e, 0x4822, 0x8624,
- 0x786a, 0x3046, 0x488e, 0x6022, 0x003a,
- 0x186a, 0x4846, 0x608e, 0x7822, 0x0076,
- 0x306a, 0x6046, 0x788e, 0x1822, 0x8012,
- 0x486a, 0x7846, 0x188e, 0x3022, 0x8062,
- 0x645a, 0x1c36, 0x347e, 0x4c12, 0x8654,
- 0x7c5a, 0x3436, 0x4c7e, 0x6412, 0x042a,
- 0x1c5a, 0x4c36, 0x647e, 0x7c12, 0x0466,
- 0x345a, 0x6436, 0x7c7e, 0x1c12, 0x8402,
- 0x4c5a, 0x7c36, 0x1c7e, 0x3412, 0x8452,
- 0x645c, 0x1c38, 0x3480, 0x4c14, 0x8658,
- 0x7c5c, 0x3438, 0x4c80, 0x6414, 0x042c,
- 0x1c5c, 0x4c38, 0x6480, 0x7c14, 0x0468,
- 0x345c, 0x6438, 0x7c80, 0x1c14, 0x8404,
- 0x4c5c, 0x7c38, 0x1c80, 0x3414, 0x8454,
- 0x645e, 0x1c3a, 0x3482, 0x4c16, 0x865c,
- 0x7c5e, 0x343a, 0x4c82, 0x6416, 0x042e,
- 0x1c5e, 0x4c3a, 0x6482, 0x7c16, 0x046a,
- 0x345e, 0x643a, 0x7c82, 0x1c16, 0x8406,
- 0x4c5e, 0x7c3a, 0x1c82, 0x3416, 0x8456,
- 0x6460, 0x1c3c, 0x3484, 0x4c18, 0x8660,
- 0x7c60, 0x343c, 0x4c84, 0x6418, 0x0430,
- 0x1c60, 0x4c3c, 0x6484, 0x7c18, 0x046c,
- 0x3460, 0x643c, 0x7c84, 0x1c18, 0x8408,
- 0x4c60, 0x7c3c, 0x1c84, 0x3418, 0x8458,
- 0x6462, 0x1c3e, 0x3486, 0x4c1a, 0x8664,
- 0x7c62, 0x343e, 0x4c86, 0x641a, 0x0432,
- 0x1c62, 0x4c3e, 0x6486, 0x7c1a, 0x046e,
- 0x3462, 0x643e, 0x7c86, 0x1c1a, 0x840a,
- 0x4c62, 0x7c3e, 0x1c86, 0x341a, 0x845a,
- 0x6464, 0x1c40, 0x3488, 0x4c1c, 0x8668,
- 0x7c64, 0x3440, 0x4c88, 0x641c, 0x0434,
- 0x1c64, 0x4c40, 0x6488, 0x7c1c, 0x0470,
- 0x3464, 0x6440, 0x7c88, 0x1c1c, 0x840c,
- 0x4c64, 0x7c40, 0x1c88, 0x341c, 0x845c,
- 0x6466, 0x1c42, 0x348a, 0x4c1e, 0x866c,
- 0x7c66, 0x3442, 0x4c8a, 0x641e, 0x0436,
- 0x1c66, 0x4c42, 0x648a, 0x7c1e, 0x0472,
- 0x3466, 0x6442, 0x7c8a, 0x1c1e, 0x840e,
- 0x4c66, 0x7c42, 0x1c8a, 0x341e, 0x845e,
- 0x6468, 0x1c44, 0x348c, 0x4c20, 0x8670,
- 0x7c68, 0x3444, 0x4c8c, 0x6420, 0x0438,
- 0x1c68, 0x4c44, 0x648c, 0x7c20, 0x0474,
- 0x3468, 0x6444, 0x7c8c, 0x1c20, 0x8410,
- 0x4c68, 0x7c44, 0x1c8c, 0x3420, 0x8460,
- 0x646a, 0x1c46, 0x348e, 0x4c22, 0x8674,
- 0x7c6a, 0x3446, 0x4c8e, 0x6422, 0x043a,
- 0x1c6a, 0x4c46, 0x648e, 0x7c22, 0x0476,
- 0x346a, 0x6446, 0x7c8e, 0x1c22, 0x8412,
- 0x4c6a, 0x7c46, 0x1c8e, 0x3422, 0x8462,
- 0x0a48, 0x3a24, 0x526c, 0x6a00, 0x2290,
- 0x2248, 0x5224, 0x6a6c, 0x0a00, 0x3a90,
- 0x3a48, 0x6a24, 0x0a6c, 0x2200, 0x5290,
- 0x5248, 0x0a24, 0x226c, 0x3a00, 0x6a90,
- 0x6a48, 0x2224, 0x3a6c, 0x5200, 0x0a90,
- 0x0a4a, 0x3a26, 0x526e, 0x6a02, 0x2292,
- 0x224a, 0x5226, 0x6a6e, 0x0a02, 0x3a92,
- 0x3a4a, 0x6a26, 0x0a6e, 0x2202, 0x5292,
- 0x524a, 0x0a26, 0x226e, 0x3a02, 0x6a92,
- 0x6a4a, 0x2226, 0x3a6e, 0x5202, 0x0a92,
- 0x0a4c, 0x3a28, 0x5270, 0x6a04, 0x2294,
- 0x224c, 0x5228, 0x6a70, 0x0a04, 0x3a94,
- 0x3a4c, 0x6a28, 0x0a70, 0x2204, 0x5294,
- 0x524c, 0x0a28, 0x2270, 0x3a04, 0x6a94,
- 0x6a4c, 0x2228, 0x3a70, 0x5204, 0x0a94,
- 0x0a4e, 0x3a2a, 0x5272, 0x6a06, 0x2296,
- 0x224e, 0x522a, 0x6a72, 0x0a06, 0x3a96,
- 0x3a4e, 0x6a2a, 0x0a72, 0x2206, 0x5296,
- 0x524e, 0x0a2a, 0x2272, 0x3a06, 0x6a96,
- 0x6a4e, 0x222a, 0x3a72, 0x5206, 0x0a96,
- 0x0a50, 0x3a2c, 0x5274, 0x6a08, 0x2298,
- 0x2250, 0x522c, 0x6a74, 0x0a08, 0x3a98,
- 0x3a50, 0x6a2c, 0x0a74, 0x2208, 0x5298,
- 0x5250, 0x0a2c, 0x2274, 0x3a08, 0x6a98,
- 0x6a50, 0x222c, 0x3a74, 0x5208, 0x0a98,
- 0x0a52, 0x3a2e, 0x5276, 0x6a0a, 0x229a,
- 0x2252, 0x522e, 0x6a76, 0x0a0a, 0x3a9a,
- 0x3a52, 0x6a2e, 0x0a76, 0x220a, 0x529a,
- 0x5252, 0x0a2e, 0x2276, 0x3a0a, 0x6a9a,
- 0x6a52, 0x222e, 0x3a76, 0x520a, 0x0a9a,
- 0x0a54, 0x3a30, 0x5278, 0x6a0c, 0x229c,
- 0x2254, 0x5230, 0x6a78, 0x0a0c, 0x3a9c,
- 0x3a54, 0x6a30, 0x0a78, 0x220c, 0x529c,
- 0x5254, 0x0a30, 0x2278, 0x3a0c, 0x6a9c,
- 0x6a54, 0x2230, 0x3a78, 0x520c, 0x0a9c,
- 0x0a56, 0x3a32, 0x527a, 0x6a0e, 0x229e,
- 0x2256, 0x5232, 0x6a7a, 0x0a0e, 0x3a9e,
- 0x3a56, 0x6a32, 0x0a7a, 0x220e, 0x529e,
- 0x5256, 0x0a32, 0x227a, 0x3a0e, 0x6a9e,
- 0x6a56, 0x2232, 0x3a7a, 0x520e, 0x0a9e,
- 0x0a58, 0x3a34, 0x527c, 0x6a10, 0x023c,
- 0x2258, 0x5234, 0x6a7c, 0x0a10, 0x0278,
- 0x3a58, 0x6a34, 0x0a7c, 0x2210, 0x8414,
- 0x5258, 0x0a34, 0x227c, 0x3a10, 0x8464,
- 0x6a58, 0x2234, 0x3a7c, 0x5210, 0x0200,
- 0x0e48, 0x3e24, 0x566c, 0x6e00, 0x2690,
- 0x2648, 0x5624, 0x6e6c, 0x0e00, 0x3e90,
- 0x3e48, 0x6e24, 0x0e6c, 0x2600, 0x5690,
- 0x5648, 0x0e24, 0x266c, 0x3e00, 0x6e90,
- 0x6e48, 0x2624, 0x3e6c, 0x5600, 0x0e90,
- 0x0e4a, 0x3e26, 0x566e, 0x6e02, 0x2692,
- 0x264a, 0x5626, 0x6e6e, 0x0e02, 0x3e92,
- 0x3e4a, 0x6e26, 0x0e6e, 0x2602, 0x5692,
- 0x564a, 0x0e26, 0x266e, 0x3e02, 0x6e92,
- 0x6e4a, 0x2626, 0x3e6e, 0x5602, 0x0e92,
- 0x0e4c, 0x3e28, 0x5670, 0x6e04, 0x2694,
- 0x264c, 0x5628, 0x6e70, 0x0e04, 0x3e94,
- 0x3e4c, 0x6e28, 0x0e70, 0x2604, 0x5694,
- 0x564c, 0x0e28, 0x2670, 0x3e04, 0x6e94,
- 0x6e4c, 0x2628, 0x3e70, 0x5604, 0x0e94,
- 0x0e4e, 0x3e2a, 0x5672, 0x6e06, 0x2696,
- 0x264e, 0x562a, 0x6e72, 0x0e06, 0x3e96,
- 0x3e4e, 0x6e2a, 0x0e72, 0x2606, 0x5696,
- 0x564e, 0x0e2a, 0x2672, 0x3e06, 0x6e96,
- 0x6e4e, 0x262a, 0x3e72, 0x5606, 0x0e96,
- 0x0e50, 0x3e2c, 0x5674, 0x6e08, 0x2698,
- 0x2650, 0x562c, 0x6e74, 0x0e08, 0x3e98,
- 0x3e50, 0x6e2c, 0x0e74, 0x2608, 0x5698,
- 0x5650, 0x0e2c, 0x2674, 0x3e08, 0x6e98,
- 0x6e50, 0x262c, 0x3e74, 0x5608, 0x0e98,
- 0x0e52, 0x3e2e, 0x5676, 0x6e0a, 0x269a,
- 0x2652, 0x562e, 0x6e76, 0x0e0a, 0x3e9a,
- 0x3e52, 0x6e2e, 0x0e76, 0x260a, 0x569a,
- 0x5652, 0x0e2e, 0x2676, 0x3e0a, 0x6e9a,
- 0x6e52, 0x262e, 0x3e76, 0x560a, 0x0e9a,
- 0x0e54, 0x3e30, 0x5678, 0x6e0c, 0x269c,
- 0x2654, 0x5630, 0x6e78, 0x0e0c, 0x3e9c,
- 0x3e54, 0x6e30, 0x0e78, 0x260c, 0x569c,
- 0x5654, 0x0e30, 0x2678, 0x3e0c, 0x6e9c,
- 0x6e54, 0x2630, 0x3e78, 0x560c, 0x0e9c,
- 0x0e56, 0x3e32, 0x567a, 0x6e0e, 0x269e,
- 0x2656, 0x5632, 0x6e7a, 0x0e0e, 0x3e9e,
- 0x3e56, 0x6e32, 0x0e7a, 0x260e, 0x569e,
- 0x5656, 0x0e32, 0x267a, 0x3e0e, 0x6e9e,
- 0x6e56, 0x2632, 0x3e7a, 0x560e, 0x0e9e,
- 0x0e58, 0x3e34, 0x567c, 0x6e10, 0x063c,
- 0x2658, 0x5634, 0x6e7c, 0x0e10, 0x0678,
- 0x3e58, 0x6e34, 0x0e7c, 0x2610, 0x8228,
- 0x5658, 0x0e34, 0x267c, 0x3e10, 0x8278,
- 0x6e58, 0x2634, 0x3e7c, 0x5610, 0x0600,
- 0x1248, 0x4224, 0x5a6c, 0x7200, 0x2a90,
- 0x2a48, 0x5a24, 0x726c, 0x1200, 0x4290,
- 0x4248, 0x7224, 0x126c, 0x2a00, 0x5a90,
- 0x5a48, 0x1224, 0x2a6c, 0x4200, 0x7290,
- 0x7248, 0x2a24, 0x426c, 0x5a00, 0x1290,
- 0x124a, 0x4226, 0x5a6e, 0x7202, 0x2a92,
- 0x2a4a, 0x5a26, 0x726e, 0x1202, 0x4292,
- 0x424a, 0x7226, 0x126e, 0x2a02, 0x5a92,
- 0x5a4a, 0x1226, 0x2a6e, 0x4202, 0x7292,
- 0x724a, 0x2a26, 0x426e, 0x5a02, 0x1292,
- 0x124c, 0x4228, 0x5a70, 0x7204, 0x2a94,
- 0x2a4c, 0x5a28, 0x7270, 0x1204, 0x4294,
- 0x424c, 0x7228, 0x1270, 0x2a04, 0x5a94,
- 0x5a4c, 0x1228, 0x2a70, 0x4204, 0x7294,
- 0x724c, 0x2a28, 0x4270, 0x5a04, 0x1294,
- 0x124e, 0x422a, 0x5a72, 0x7206, 0x2a96,
- 0x2a4e, 0x5a2a, 0x7272, 0x1206, 0x4296,
- 0x424e, 0x722a, 0x1272, 0x2a06, 0x5a96,
- 0x5a4e, 0x122a, 0x2a72, 0x4206, 0x7296,
- 0x724e, 0x2a2a, 0x4272, 0x5a06, 0x1296,
- 0x1250, 0x422c, 0x5a74, 0x7208, 0x2a98,
- 0x2a50, 0x5a2c, 0x7274, 0x1208, 0x4298,
- 0x4250, 0x722c, 0x1274, 0x2a08, 0x5a98,
- 0x5a50, 0x122c, 0x2a74, 0x4208, 0x7298,
- 0x7250, 0x2a2c, 0x4274, 0x5a08, 0x1298,
- 0x1252, 0x422e, 0x5a76, 0x720a, 0x2a9a,
- 0x2a52, 0x5a2e, 0x7276, 0x120a, 0x429a,
- 0x4252, 0x722e, 0x1276, 0x2a0a, 0x5a9a,
- 0x5a52, 0x122e, 0x2a76, 0x420a, 0x729a,
- 0x7252, 0x2a2e, 0x4276, 0x5a0a, 0x129a,
- 0x1254, 0x4230, 0x5a78, 0x720c, 0x2a9c,
- 0x2a54, 0x5a30, 0x7278, 0x120c, 0x429c,
- 0x4254, 0x7230, 0x1278, 0x2a0c, 0x5a9c,
- 0x5a54, 0x1230, 0x2a78, 0x420c, 0x729c,
- 0x7254, 0x2a30, 0x4278, 0x5a0c, 0x129c,
- 0x1256, 0x4232, 0x5a7a, 0x720e, 0x2a9e,
- 0x2a56, 0x5a32, 0x727a, 0x120e, 0x429e,
- 0x4256, 0x7232, 0x127a, 0x2a0e, 0x5a9e,
- 0x5a56, 0x1232, 0x2a7a, 0x420e, 0x729e,
- 0x7256, 0x2a32, 0x427a, 0x5a0e, 0x129e,
- 0x1258, 0x4234, 0x5a7c, 0x7210, 0x0250,
- 0x2a58, 0x5a34, 0x727c, 0x1210, 0x028c,
- 0x4258, 0x7234, 0x127c, 0x2a10, 0x803c,
- 0x5a58, 0x1234, 0x2a7c, 0x4210, 0x808c,
- 0x7258, 0x2a34, 0x427c, 0x5a10, 0x0214,
- 0x1648, 0x4624, 0x5e6c, 0x7600, 0x2e90,
- 0x2e48, 0x5e24, 0x766c, 0x1600, 0x4690,
- 0x4648, 0x7624, 0x166c, 0x2e00, 0x5e90,
- 0x5e48, 0x1624, 0x2e6c, 0x4600, 0x7690,
- 0x7648, 0x2e24, 0x466c, 0x5e00, 0x1690,
- 0x164a, 0x4626, 0x5e6e, 0x7602, 0x2e92,
- 0x2e4a, 0x5e26, 0x766e, 0x1602, 0x4692,
- 0x464a, 0x7626, 0x166e, 0x2e02, 0x5e92,
- 0x5e4a, 0x1626, 0x2e6e, 0x4602, 0x7692,
- 0x764a, 0x2e26, 0x466e, 0x5e02, 0x1692,
- 0x164c, 0x4628, 0x5e70, 0x7604, 0x2e94,
- 0x2e4c, 0x5e28, 0x7670, 0x1604, 0x4694,
- 0x464c, 0x7628, 0x1670, 0x2e04, 0x5e94,
- 0x5e4c, 0x1628, 0x2e70, 0x4604, 0x7694,
- 0x764c, 0x2e28, 0x4670, 0x5e04, 0x1694,
- 0x164e, 0x462a, 0x5e72, 0x7606, 0x2e96,
- 0x2e4e, 0x5e2a, 0x7672, 0x1606, 0x4696,
- 0x464e, 0x762a, 0x1672, 0x2e06, 0x5e96,
- 0x5e4e, 0x162a, 0x2e72, 0x4606, 0x7696,
- 0x764e, 0x2e2a, 0x4672, 0x5e06, 0x1696,
- 0x1650, 0x462c, 0x5e74, 0x7608, 0x2e98,
- 0x2e50, 0x5e2c, 0x7674, 0x1608, 0x4698,
- 0x4650, 0x762c, 0x1674, 0x2e08, 0x5e98,
- 0x5e50, 0x162c, 0x2e74, 0x4608, 0x7698,
- 0x7650, 0x2e2c, 0x4674, 0x5e08, 0x1698,
- 0x1652, 0x462e, 0x5e76, 0x760a, 0x2e9a,
- 0x2e52, 0x5e2e, 0x7676, 0x160a, 0x469a,
- 0x4652, 0x762e, 0x1676, 0x2e0a, 0x5e9a,
- 0x5e52, 0x162e, 0x2e76, 0x460a, 0x769a,
- 0x7652, 0x2e2e, 0x4676, 0x5e0a, 0x169a,
- 0x1654, 0x4630, 0x5e78, 0x760c, 0x2e9c,
- 0x2e54, 0x5e30, 0x7678, 0x160c, 0x469c,
- 0x4654, 0x7630, 0x1678, 0x2e0c, 0x5e9c,
- 0x5e54, 0x1630, 0x2e78, 0x460c, 0x769c,
- 0x7654, 0x2e30, 0x4678, 0x5e0c, 0x169c,
- 0x1656, 0x4632, 0x5e7a, 0x760e, 0x2e9e,
- 0x2e56, 0x5e32, 0x767a, 0x160e, 0x469e,
- 0x4656, 0x7632, 0x167a, 0x2e0e, 0x5e9e,
- 0x5e56, 0x1632, 0x2e7a, 0x460e, 0x769e,
- 0x7656, 0x2e32, 0x467a, 0x5e0e, 0x169e,
- 0x1658, 0x4634, 0x5e7c, 0x7610, 0x0650,
- 0x2e58, 0x5e34, 0x767c, 0x1610, 0x068c,
- 0x4658, 0x7634, 0x167c, 0x2e10, 0x843c,
- 0x5e58, 0x1634, 0x2e7c, 0x4610, 0x848c,
- 0x7658, 0x2e34, 0x467c, 0x5e10, 0x0614,
- 0x1a48, 0x4a24, 0x626c, 0x7a00, 0x3290,
- 0x3248, 0x6224, 0x7a6c, 0x1a00, 0x4a90,
- 0x4a48, 0x7a24, 0x1a6c, 0x3200, 0x6290,
- 0x6248, 0x1a24, 0x326c, 0x4a00, 0x7a90,
- 0x7a48, 0x3224, 0x4a6c, 0x6200, 0x1a90,
- 0x1a4a, 0x4a26, 0x626e, 0x7a02, 0x3292,
- 0x324a, 0x6226, 0x7a6e, 0x1a02, 0x4a92,
- 0x4a4a, 0x7a26, 0x1a6e, 0x3202, 0x6292,
- 0x624a, 0x1a26, 0x326e, 0x4a02, 0x7a92,
- 0x7a4a, 0x3226, 0x4a6e, 0x6202, 0x1a92,
- 0x1a4c, 0x4a28, 0x6270, 0x7a04, 0x3294,
- 0x324c, 0x6228, 0x7a70, 0x1a04, 0x4a94,
- 0x4a4c, 0x7a28, 0x1a70, 0x3204, 0x6294,
- 0x624c, 0x1a28, 0x3270, 0x4a04, 0x7a94,
- 0x7a4c, 0x3228, 0x4a70, 0x6204, 0x1a94,
- 0x1a4e, 0x4a2a, 0x6272, 0x7a06, 0x3296,
- 0x324e, 0x622a, 0x7a72, 0x1a06, 0x4a96,
- 0x4a4e, 0x7a2a, 0x1a72, 0x3206, 0x6296,
- 0x624e, 0x1a2a, 0x3272, 0x4a06, 0x7a96,
- 0x7a4e, 0x322a, 0x4a72, 0x6206, 0x1a96,
- 0x1a50, 0x4a2c, 0x6274, 0x7a08, 0x3298,
- 0x3250, 0x622c, 0x7a74, 0x1a08, 0x4a98,
- 0x4a50, 0x7a2c, 0x1a74, 0x3208, 0x6298,
- 0x6250, 0x1a2c, 0x3274, 0x4a08, 0x7a98,
- 0x7a50, 0x322c, 0x4a74, 0x6208, 0x1a98,
- 0x1a52, 0x4a2e, 0x6276, 0x7a0a, 0x329a,
- 0x3252, 0x622e, 0x7a76, 0x1a0a, 0x4a9a,
- 0x4a52, 0x7a2e, 0x1a76, 0x320a, 0x629a,
- 0x6252, 0x1a2e, 0x3276, 0x4a0a, 0x7a9a,
- 0x7a52, 0x322e, 0x4a76, 0x620a, 0x1a9a,
- 0x1a54, 0x4a30, 0x6278, 0x7a0c, 0x329c,
- 0x3254, 0x6230, 0x7a78, 0x1a0c, 0x4a9c,
- 0x4a54, 0x7a30, 0x1a78, 0x320c, 0x629c,
- 0x6254, 0x1a30, 0x3278, 0x4a0c, 0x7a9c,
- 0x7a54, 0x3230, 0x4a78, 0x620c, 0x1a9c,
- 0x1a56, 0x4a32, 0x627a, 0x7a0e, 0x329e,
- 0x3256, 0x6232, 0x7a7a, 0x1a0e, 0x4a9e,
- 0x4a56, 0x7a32, 0x1a7a, 0x320e, 0x629e,
- 0x6256, 0x1a32, 0x327a, 0x4a0e, 0x7a9e,
- 0x7a56, 0x3232, 0x4a7a, 0x620e, 0x1a9e,
- 0x1a58, 0x4a34, 0x627c, 0x7a10, 0x0264,
- 0x3258, 0x6234, 0x7a7c, 0x1a10, 0x8200,
- 0x4a58, 0x7a34, 0x1a7c, 0x3210, 0x8250,
- 0x6258, 0x1a34, 0x327c, 0x4a10, 0x8628,
- 0x7a58, 0x3234, 0x4a7c, 0x6210, 0x0228,
- 0x1e48, 0x4e24, 0x666c, 0x7e00, 0x3690,
- 0x3648, 0x6624, 0x7e6c, 0x1e00, 0x4e90,
- 0x4e48, 0x7e24, 0x1e6c, 0x3600, 0x6690,
- 0x6648, 0x1e24, 0x366c, 0x4e00, 0x7e90,
- 0x7e48, 0x3624, 0x4e6c, 0x6600, 0x1e90,
- 0x1e4a, 0x4e26, 0x666e, 0x7e02, 0x3692,
- 0x364a, 0x6626, 0x7e6e, 0x1e02, 0x4e92,
- 0x4e4a, 0x7e26, 0x1e6e, 0x3602, 0x6692,
- 0x664a, 0x1e26, 0x366e, 0x4e02, 0x7e92,
- 0x7e4a, 0x3626, 0x4e6e, 0x6602, 0x1e92,
- 0x1e4c, 0x4e28, 0x6670, 0x7e04, 0x3694,
- 0x364c, 0x6628, 0x7e70, 0x1e04, 0x4e94,
- 0x4e4c, 0x7e28, 0x1e70, 0x3604, 0x6694,
- 0x664c, 0x1e28, 0x3670, 0x4e04, 0x7e94,
- 0x7e4c, 0x3628, 0x4e70, 0x6604, 0x1e94,
- 0x1e4e, 0x4e2a, 0x6672, 0x7e06, 0x3696,
- 0x364e, 0x662a, 0x7e72, 0x1e06, 0x4e96,
- 0x4e4e, 0x7e2a, 0x1e72, 0x3606, 0x6696,
- 0x664e, 0x1e2a, 0x3672, 0x4e06, 0x7e96,
- 0x7e4e, 0x362a, 0x4e72, 0x6606, 0x1e96,
- 0x1e50, 0x4e2c, 0x6674, 0x7e08, 0x3698,
- 0x3650, 0x662c, 0x7e74, 0x1e08, 0x4e98,
- 0x4e50, 0x7e2c, 0x1e74, 0x3608, 0x6698,
- 0x6650, 0x1e2c, 0x3674, 0x4e08, 0x7e98,
- 0x7e50, 0x362c, 0x4e74, 0x6608, 0x1e98,
- 0x1e52, 0x4e2e, 0x6676, 0x7e0a, 0x369a,
- 0x3652, 0x662e, 0x7e76, 0x1e0a, 0x4e9a,
- 0x4e52, 0x7e2e, 0x1e76, 0x360a, 0x669a,
- 0x6652, 0x1e2e, 0x3676, 0x4e0a, 0x7e9a,
- 0x7e52, 0x362e, 0x4e76, 0x660a, 0x1e9a,
- 0x1e54, 0x4e30, 0x6678, 0x7e0c, 0x369c,
- 0x3654, 0x6630, 0x7e78, 0x1e0c, 0x4e9c,
- 0x4e54, 0x7e30, 0x1e78, 0x360c, 0x669c,
- 0x6654, 0x1e30, 0x3678, 0x4e0c, 0x7e9c,
- 0x7e54, 0x3630, 0x4e78, 0x660c, 0x1e9c,
- 0x1e56, 0x4e32, 0x667a, 0x7e0e, 0x369e,
- 0x3656, 0x6632, 0x7e7a, 0x1e0e, 0x4e9e,
- 0x4e56, 0x7e32, 0x1e7a, 0x360e, 0x669e,
- 0x6656, 0x1e32, 0x367a, 0x4e0e, 0x7e9e,
- 0x7e56, 0x3632, 0x4e7a, 0x660e, 0x1e9e,
- 0x1e58, 0x4e34, 0x667c, 0x7e10, 0x0664,
- 0x3658, 0x6634, 0x7e7c, 0x1e10, 0x8014,
- 0x4e58, 0x7e34, 0x1e7c, 0x3610, 0x8064,
- 0x6658, 0x1e34, 0x367c, 0x4e10, 0x8678,
- 0x7e58, 0x3634, 0x4e7c, 0x6610, 0x0628,
- 0x3a5a, 0x6a36, 0x0a7e, 0x2212, 0x8416,
- 0x525a, 0x0a36, 0x227e, 0x3a12, 0x8466,
- 0x6a5a, 0x2236, 0x3a7e, 0x5212, 0x0202,
- 0x0a5a, 0x3a36, 0x527e, 0x6a12, 0x023e,
- 0x225a, 0x5236, 0x6a7e, 0x0a12, 0x027a,
- 0x3a5c, 0x6a38, 0x0a80, 0x2214, 0x8418,
- 0x525c, 0x0a38, 0x2280, 0x3a14, 0x8468,
- 0x6a5c, 0x2238, 0x3a80, 0x5214, 0x0204,
- 0x0a5c, 0x3a38, 0x5280, 0x6a14, 0x0240,
- 0x225c, 0x5238, 0x6a80, 0x0a14, 0x027c,
- 0x3a5e, 0x6a3a, 0x0a82, 0x2216, 0x841a,
- 0x525e, 0x0a3a, 0x2282, 0x3a16, 0x846a,
- 0x6a5e, 0x223a, 0x3a82, 0x5216, 0x0206,
- 0x0a5e, 0x3a3a, 0x5282, 0x6a16, 0x0242,
- 0x225e, 0x523a, 0x6a82, 0x0a16, 0x027e,
- 0x3a60, 0x6a3c, 0x0a84, 0x2218, 0x841c,
- 0x5260, 0x0a3c, 0x2284, 0x3a18, 0x846c,
- 0x6a60, 0x223c, 0x3a84, 0x5218, 0x0208,
- 0x0a60, 0x3a3c, 0x5284, 0x6a18, 0x0244,
- 0x2260, 0x523c, 0x6a84, 0x0a18, 0x0280,
- 0x3a62, 0x6a3e, 0x0a86, 0x221a, 0x841e,
- 0x5262, 0x0a3e, 0x2286, 0x3a1a, 0x846e,
- 0x6a62, 0x223e, 0x3a86, 0x521a, 0x020a,
- 0x0a62, 0x3a3e, 0x5286, 0x6a1a, 0x0246,
- 0x2262, 0x523e, 0x6a86, 0x0a1a, 0x0282,
- 0x3a64, 0x6a40, 0x0a88, 0x221c, 0x8420,
- 0x5264, 0x0a40, 0x2288, 0x3a1c, 0x8470,
- 0x6a64, 0x2240, 0x3a88, 0x521c, 0x020c,
- 0x0a64, 0x3a40, 0x5288, 0x6a1c, 0x0248,
- 0x2264, 0x5240, 0x6a88, 0x0a1c, 0x0284,
- 0x3a66, 0x6a42, 0x0a8a, 0x221e, 0x8422,
- 0x5266, 0x0a42, 0x228a, 0x3a1e, 0x8472,
- 0x6a66, 0x2242, 0x3a8a, 0x521e, 0x020e,
- 0x0a66, 0x3a42, 0x528a, 0x6a1e, 0x024a,
- 0x2266, 0x5242, 0x6a8a, 0x0a1e, 0x0286,
- 0x3a68, 0x6a44, 0x0a8c, 0x2220, 0x8424,
- 0x5268, 0x0a44, 0x228c, 0x3a20, 0x8474,
- 0x6a68, 0x2244, 0x3a8c, 0x5220, 0x0210,
- 0x0a68, 0x3a44, 0x528c, 0x6a20, 0x024c,
- 0x2268, 0x5244, 0x6a8c, 0x0a20, 0x0288,
- 0x3a6a, 0x6a46, 0x0a8e, 0x2222, 0x8426,
- 0x526a, 0x0a46, 0x228e, 0x3a22, 0x8476,
- 0x6a6a, 0x2246, 0x3a8e, 0x5222, 0x0212,
- 0x0a6a, 0x3a46, 0x528e, 0x6a22, 0x024e,
- 0x226a, 0x5246, 0x6a8e, 0x0a22, 0x028a,
- 0x3e5a, 0x6e36, 0x0e7e, 0x2612, 0x822a,
- 0x565a, 0x0e36, 0x267e, 0x3e12, 0x827a,
- 0x6e5a, 0x2636, 0x3e7e, 0x5612, 0x0602,
- 0x0e5a, 0x3e36, 0x567e, 0x6e12, 0x063e,
- 0x265a, 0x5636, 0x6e7e, 0x0e12, 0x067a,
- 0x3e5c, 0x6e38, 0x0e80, 0x2614, 0x822c,
- 0x565c, 0x0e38, 0x2680, 0x3e14, 0x827c,
- 0x6e5c, 0x2638, 0x3e80, 0x5614, 0x0604,
- 0x0e5c, 0x3e38, 0x5680, 0x6e14, 0x0640,
- 0x265c, 0x5638, 0x6e80, 0x0e14, 0x067c,
- 0x3e5e, 0x6e3a, 0x0e82, 0x2616, 0x822e,
- 0x565e, 0x0e3a, 0x2682, 0x3e16, 0x827e,
- 0x6e5e, 0x263a, 0x3e82, 0x5616, 0x0606,
- 0x0e5e, 0x3e3a, 0x5682, 0x6e16, 0x0642,
- 0x265e, 0x563a, 0x6e82, 0x0e16, 0x067e,
- 0x3e60, 0x6e3c, 0x0e84, 0x2618, 0x8230,
- 0x5660, 0x0e3c, 0x2684, 0x3e18, 0x8280,
- 0x6e60, 0x263c, 0x3e84, 0x5618, 0x0608,
- 0x0e60, 0x3e3c, 0x5684, 0x6e18, 0x0644,
- 0x2660, 0x563c, 0x6e84, 0x0e18, 0x0680,
- 0x3e62, 0x6e3e, 0x0e86, 0x261a, 0x8232,
- 0x5662, 0x0e3e, 0x2686, 0x3e1a, 0x8282,
- 0x6e62, 0x263e, 0x3e86, 0x561a, 0x060a,
- 0x0e62, 0x3e3e, 0x5686, 0x6e1a, 0x0646,
- 0x2662, 0x563e, 0x6e86, 0x0e1a, 0x0682,
- 0x3e64, 0x6e40, 0x0e88, 0x261c, 0x8234,
- 0x5664, 0x0e40, 0x2688, 0x3e1c, 0x8284,
- 0x6e64, 0x2640, 0x3e88, 0x561c, 0x060c,
- 0x0e64, 0x3e40, 0x5688, 0x6e1c, 0x0648,
- 0x2664, 0x5640, 0x6e88, 0x0e1c, 0x0684,
- 0x3e66, 0x6e42, 0x0e8a, 0x261e, 0x8236,
- 0x5666, 0x0e42, 0x268a, 0x3e1e, 0x8286,
- 0x6e66, 0x2642, 0x3e8a, 0x561e, 0x060e,
- 0x0e66, 0x3e42, 0x568a, 0x6e1e, 0x064a,
- 0x2666, 0x5642, 0x6e8a, 0x0e1e, 0x0686,
- 0x3e68, 0x6e44, 0x0e8c, 0x2620, 0x8238,
- 0x5668, 0x0e44, 0x268c, 0x3e20, 0x8288,
- 0x6e68, 0x2644, 0x3e8c, 0x5620, 0x0610,
- 0x0e68, 0x3e44, 0x568c, 0x6e20, 0x064c,
- 0x2668, 0x5644, 0x6e8c, 0x0e20, 0x0688,
- 0x3e6a, 0x6e46, 0x0e8e, 0x2622, 0x823a,
- 0x566a, 0x0e46, 0x268e, 0x3e22, 0x828a,
- 0x6e6a, 0x2646, 0x3e8e, 0x5622, 0x0612,
- 0x0e6a, 0x3e46, 0x568e, 0x6e22, 0x064e,
- 0x266a, 0x5646, 0x6e8e, 0x0e22, 0x068a,
- 0x425a, 0x7236, 0x127e, 0x2a12, 0x803e,
- 0x5a5a, 0x1236, 0x2a7e, 0x4212, 0x808e,
- 0x725a, 0x2a36, 0x427e, 0x5a12, 0x0216,
- 0x125a, 0x4236, 0x5a7e, 0x7212, 0x0252,
- 0x2a5a, 0x5a36, 0x727e, 0x1212, 0x028e,
- 0x425c, 0x7238, 0x1280, 0x2a14, 0x8040,
- 0x5a5c, 0x1238, 0x2a80, 0x4214, 0x8090,
- 0x725c, 0x2a38, 0x4280, 0x5a14, 0x0218,
- 0x125c, 0x4238, 0x5a80, 0x7214, 0x0254,
- 0x2a5c, 0x5a38, 0x7280, 0x1214, 0x0290,
- 0x425e, 0x723a, 0x1282, 0x2a16, 0x8042,
- 0x5a5e, 0x123a, 0x2a82, 0x4216, 0x8092,
- 0x725e, 0x2a3a, 0x4282, 0x5a16, 0x021a,
- 0x125e, 0x423a, 0x5a82, 0x7216, 0x0256,
- 0x2a5e, 0x5a3a, 0x7282, 0x1216, 0x0292,
- 0x4260, 0x723c, 0x1284, 0x2a18, 0x8044,
- 0x5a60, 0x123c, 0x2a84, 0x4218, 0x8094,
- 0x7260, 0x2a3c, 0x4284, 0x5a18, 0x021c,
- 0x1260, 0x423c, 0x5a84, 0x7218, 0x0258,
- 0x2a60, 0x5a3c, 0x7284, 0x1218, 0x0294,
- 0x4262, 0x723e, 0x1286, 0x2a1a, 0x8046,
- 0x5a62, 0x123e, 0x2a86, 0x421a, 0x8096,
- 0x7262, 0x2a3e, 0x4286, 0x5a1a, 0x021e,
- 0x1262, 0x423e, 0x5a86, 0x721a, 0x025a,
- 0x2a62, 0x5a3e, 0x7286, 0x121a, 0x0296,
- 0x4264, 0x7240, 0x1288, 0x2a1c, 0x8048,
- 0x5a64, 0x1240, 0x2a88, 0x421c, 0x8098,
- 0x7264, 0x2a40, 0x4288, 0x5a1c, 0x0220,
- 0x1264, 0x4240, 0x5a88, 0x721c, 0x025c,
- 0x2a64, 0x5a40, 0x7288, 0x121c, 0x0298,
- 0x4266, 0x7242, 0x128a, 0x2a1e, 0x804a,
- 0x5a66, 0x1242, 0x2a8a, 0x421e, 0x809a,
- 0x7266, 0x2a42, 0x428a, 0x5a1e, 0x0222,
- 0x1266, 0x4242, 0x5a8a, 0x721e, 0x025e,
- 0x2a66, 0x5a42, 0x728a, 0x121e, 0x029a,
- 0x4268, 0x7244, 0x128c, 0x2a20, 0x804c,
- 0x5a68, 0x1244, 0x2a8c, 0x4220, 0x809c,
- 0x7268, 0x2a44, 0x428c, 0x5a20, 0x0224,
- 0x1268, 0x4244, 0x5a8c, 0x7220, 0x0260,
- 0x2a68, 0x5a44, 0x728c, 0x1220, 0x029c,
- 0x426a, 0x7246, 0x128e, 0x2a22, 0x804e,
- 0x5a6a, 0x1246, 0x2a8e, 0x4222, 0x809e,
- 0x726a, 0x2a46, 0x428e, 0x5a22, 0x0226,
- 0x126a, 0x4246, 0x5a8e, 0x7222, 0x0262,
- 0x2a6a, 0x5a46, 0x728e, 0x1222, 0x029e,
- 0x465a, 0x7636, 0x167e, 0x2e12, 0x843e,
- 0x5e5a, 0x1636, 0x2e7e, 0x4612, 0x848e,
- 0x765a, 0x2e36, 0x467e, 0x5e12, 0x0616,
- 0x165a, 0x4636, 0x5e7e, 0x7612, 0x0652,
- 0x2e5a, 0x5e36, 0x767e, 0x1612, 0x068e,
- 0x465c, 0x7638, 0x1680, 0x2e14, 0x8440,
- 0x5e5c, 0x1638, 0x2e80, 0x4614, 0x8490,
- 0x765c, 0x2e38, 0x4680, 0x5e14, 0x0618,
- 0x165c, 0x4638, 0x5e80, 0x7614, 0x0654,
- 0x2e5c, 0x5e38, 0x7680, 0x1614, 0x0690,
- 0x465e, 0x763a, 0x1682, 0x2e16, 0x8442,
- 0x5e5e, 0x163a, 0x2e82, 0x4616, 0x8492,
- 0x765e, 0x2e3a, 0x4682, 0x5e16, 0x061a,
- 0x165e, 0x463a, 0x5e82, 0x7616, 0x0656,
- 0x2e5e, 0x5e3a, 0x7682, 0x1616, 0x0692,
- 0x4660, 0x763c, 0x1684, 0x2e18, 0x8444,
- 0x5e60, 0x163c, 0x2e84, 0x4618, 0x8494,
- 0x7660, 0x2e3c, 0x4684, 0x5e18, 0x061c,
- 0x1660, 0x463c, 0x5e84, 0x7618, 0x0658,
- 0x2e60, 0x5e3c, 0x7684, 0x1618, 0x0694,
- 0x4662, 0x763e, 0x1686, 0x2e1a, 0x8446,
- 0x5e62, 0x163e, 0x2e86, 0x461a, 0x8496,
- 0x7662, 0x2e3e, 0x4686, 0x5e1a, 0x061e,
- 0x1662, 0x463e, 0x5e86, 0x761a, 0x065a,
- 0x2e62, 0x5e3e, 0x7686, 0x161a, 0x0696,
- 0x4664, 0x7640, 0x1688, 0x2e1c, 0x8448,
- 0x5e64, 0x1640, 0x2e88, 0x461c, 0x8498,
- 0x7664, 0x2e40, 0x4688, 0x5e1c, 0x0620,
- 0x1664, 0x4640, 0x5e88, 0x761c, 0x065c,
- 0x2e64, 0x5e40, 0x7688, 0x161c, 0x0698,
- 0x4666, 0x7642, 0x168a, 0x2e1e, 0x844a,
- 0x5e66, 0x1642, 0x2e8a, 0x461e, 0x849a,
- 0x7666, 0x2e42, 0x468a, 0x5e1e, 0x0622,
- 0x1666, 0x4642, 0x5e8a, 0x761e, 0x065e,
- 0x2e66, 0x5e42, 0x768a, 0x161e, 0x069a,
- 0x4668, 0x7644, 0x168c, 0x2e20, 0x844c,
- 0x5e68, 0x1644, 0x2e8c, 0x4620, 0x849c,
- 0x7668, 0x2e44, 0x468c, 0x5e20, 0x0624,
- 0x1668, 0x4644, 0x5e8c, 0x7620, 0x0660,
- 0x2e68, 0x5e44, 0x768c, 0x1620, 0x069c,
- 0x466a, 0x7646, 0x168e, 0x2e22, 0x844e,
- 0x5e6a, 0x1646, 0x2e8e, 0x4622, 0x849e,
- 0x766a, 0x2e46, 0x468e, 0x5e22, 0x0626,
- 0x166a, 0x4646, 0x5e8e, 0x7622, 0x0662,
- 0x2e6a, 0x5e46, 0x768e, 0x1622, 0x069e,
- 0x4a5a, 0x7a36, 0x1a7e, 0x3212, 0x8252,
- 0x625a, 0x1a36, 0x327e, 0x4a12, 0x862c,
- 0x7a5a, 0x3236, 0x4a7e, 0x6212, 0x022a,
- 0x1a5a, 0x4a36, 0x627e, 0x7a12, 0x0266,
- 0x325a, 0x6236, 0x7a7e, 0x1a12, 0x8202,
- 0x4a5c, 0x7a38, 0x1a80, 0x3214, 0x8254,
- 0x625c, 0x1a38, 0x3280, 0x4a14, 0x8630,
- 0x7a5c, 0x3238, 0x4a80, 0x6214, 0x022c,
- 0x1a5c, 0x4a38, 0x6280, 0x7a14, 0x0268,
- 0x325c, 0x6238, 0x7a80, 0x1a14, 0x8204,
- 0x4a5e, 0x7a3a, 0x1a82, 0x3216, 0x8256,
- 0x625e, 0x1a3a, 0x3282, 0x4a16, 0x8634,
- 0x7a5e, 0x323a, 0x4a82, 0x6216, 0x022e,
- 0x1a5e, 0x4a3a, 0x6282, 0x7a16, 0x026a,
- 0x325e, 0x623a, 0x7a82, 0x1a16, 0x8206,
- 0x4a60, 0x7a3c, 0x1a84, 0x3218, 0x8258,
- 0x6260, 0x1a3c, 0x3284, 0x4a18, 0x8638,
- 0x7a60, 0x323c, 0x4a84, 0x6218, 0x0230,
- 0x1a60, 0x4a3c, 0x6284, 0x7a18, 0x026c,
- 0x3260, 0x623c, 0x7a84, 0x1a18, 0x8208,
- 0x4a62, 0x7a3e, 0x1a86, 0x321a, 0x825a,
- 0x6262, 0x1a3e, 0x3286, 0x4a1a, 0x863c,
- 0x7a62, 0x323e, 0x4a86, 0x621a, 0x0232,
- 0x1a62, 0x4a3e, 0x6286, 0x7a1a, 0x026e,
- 0x3262, 0x623e, 0x7a86, 0x1a1a, 0x820a,
- 0x4a64, 0x7a40, 0x1a88, 0x321c, 0x825c,
- 0x6264, 0x1a40, 0x3288, 0x4a1c, 0x8640,
- 0x7a64, 0x3240, 0x4a88, 0x621c, 0x0234,
- 0x1a64, 0x4a40, 0x6288, 0x7a1c, 0x0270,
- 0x3264, 0x6240, 0x7a88, 0x1a1c, 0x820c,
- 0x4a66, 0x7a42, 0x1a8a, 0x321e, 0x825e,
- 0x6266, 0x1a42, 0x328a, 0x4a1e, 0x8644,
- 0x7a66, 0x3242, 0x4a8a, 0x621e, 0x0236,
- 0x1a66, 0x4a42, 0x628a, 0x7a1e, 0x0272,
- 0x3266, 0x6242, 0x7a8a, 0x1a1e, 0x820e,
- 0x4a68, 0x7a44, 0x1a8c, 0x3220, 0x8260,
- 0x6268, 0x1a44, 0x328c, 0x4a20, 0x8648,
- 0x7a68, 0x3244, 0x4a8c, 0x6220, 0x0238,
- 0x1a68, 0x4a44, 0x628c, 0x7a20, 0x0274,
- 0x3268, 0x6244, 0x7a8c, 0x1a20, 0x8210,
- 0x4a6a, 0x7a46, 0x1a8e, 0x3222, 0x8262,
- 0x626a, 0x1a46, 0x328e, 0x4a22, 0x864c,
- 0x7a6a, 0x3246, 0x4a8e, 0x6222, 0x023a,
- 0x1a6a, 0x4a46, 0x628e, 0x7a22, 0x0276,
- 0x326a, 0x6246, 0x7a8e, 0x1a22, 0x8212,
- 0x4e5a, 0x7e36, 0x1e7e, 0x3612, 0x8066,
- 0x665a, 0x1e36, 0x367e, 0x4e12, 0x867c,
- 0x7e5a, 0x3636, 0x4e7e, 0x6612, 0x062a,
- 0x1e5a, 0x4e36, 0x667e, 0x7e12, 0x0666,
- 0x365a, 0x6636, 0x7e7e, 0x1e12, 0x8016,
- 0x4e5c, 0x7e38, 0x1e80, 0x3614, 0x8068,
- 0x665c, 0x1e38, 0x3680, 0x4e14, 0x8680,
- 0x7e5c, 0x3638, 0x4e80, 0x6614, 0x062c,
- 0x1e5c, 0x4e38, 0x6680, 0x7e14, 0x0668,
- 0x365c, 0x6638, 0x7e80, 0x1e14, 0x8018,
- 0x4e5e, 0x7e3a, 0x1e82, 0x3616, 0x806a,
- 0x665e, 0x1e3a, 0x3682, 0x4e16, 0x8684,
- 0x7e5e, 0x363a, 0x4e82, 0x6616, 0x062e,
- 0x1e5e, 0x4e3a, 0x6682, 0x7e16, 0x066a,
- 0x365e, 0x663a, 0x7e82, 0x1e16, 0x801a,
- 0x4e60, 0x7e3c, 0x1e84, 0x3618, 0x806c,
- 0x6660, 0x1e3c, 0x3684, 0x4e18, 0x8688,
- 0x7e60, 0x363c, 0x4e84, 0x6618, 0x0630,
- 0x1e60, 0x4e3c, 0x6684, 0x7e18, 0x066c,
- 0x3660, 0x663c, 0x7e84, 0x1e18, 0x801c,
- 0x4e62, 0x7e3e, 0x1e86, 0x361a, 0x806e,
- 0x6662, 0x1e3e, 0x3686, 0x4e1a, 0x868c,
- 0x7e62, 0x363e, 0x4e86, 0x661a, 0x0632,
- 0x1e62, 0x4e3e, 0x6686, 0x7e1a, 0x066e,
- 0x3662, 0x663e, 0x7e86, 0x1e1a, 0x801e,
- 0x4e64, 0x7e40, 0x1e88, 0x361c, 0x8070,
- 0x6664, 0x1e40, 0x3688, 0x4e1c, 0x8690,
- 0x7e64, 0x3640, 0x4e88, 0x661c, 0x0634,
- 0x1e64, 0x4e40, 0x6688, 0x7e1c, 0x0670,
- 0x3664, 0x6640, 0x7e88, 0x1e1c, 0x8020,
- 0x4e66, 0x7e42, 0x1e8a, 0x361e, 0x8072,
- 0x6666, 0x1e42, 0x368a, 0x4e1e, 0x8694,
- 0x7e66, 0x3642, 0x4e8a, 0x661e, 0x0636,
- 0x1e66, 0x4e42, 0x668a, 0x7e1e, 0x0672,
- 0x3666, 0x6642, 0x7e8a, 0x1e1e, 0x8022,
- 0x4e68, 0x7e44, 0x1e8c, 0x3620, 0x8074,
- 0x6668, 0x1e44, 0x368c, 0x4e20, 0x8698,
- 0x7e68, 0x3644, 0x4e8c, 0x6620, 0x0638,
- 0x1e68, 0x4e44, 0x668c, 0x7e20, 0x0674,
- 0x3668, 0x6644, 0x7e8c, 0x1e20, 0x8024,
- 0x4e6a, 0x7e46, 0x1e8e, 0x3622, 0x8076,
- 0x666a, 0x1e46, 0x368e, 0x4e22, 0x869c,
- 0x7e6a, 0x3646, 0x4e8e, 0x6622, 0x063a,
- 0x1e6a, 0x4e46, 0x668e, 0x7e22, 0x0676,
- 0x366a, 0x6646, 0x7e8e, 0x1e22, 0x8026,
-};
-
-static const uint16_t dv_place_1080i50[4*12*27*5] = {
- 0x1a48, 0x4a24, 0x626c, 0x0200, 0x3290,
- 0x2648, 0x5624, 0x6e6c, 0x0e00, 0x3e90,
- 0x3248, 0x6224, 0x7a6c, 0x1a00, 0x4a90,
- 0x3e48, 0x6e24, 0x026c, 0x2600, 0x5690,
- 0x4a48, 0x7a24, 0x0e6c, 0x3200, 0x6290,
- 0x5648, 0x0224, 0x1a6c, 0x3e00, 0x6e90,
- 0x6248, 0x0e24, 0x266c, 0x4a00, 0x7a90,
- 0x6e48, 0x1a24, 0x326c, 0x5600, 0x0290,
- 0x7a48, 0x2624, 0x3e6c, 0x6200, 0x0e90,
- 0x0248, 0x3224, 0x4a6c, 0x6e00, 0x1a90,
- 0x0e48, 0x3e24, 0x566c, 0x7a00, 0x2690,
- 0x1a4a, 0x4a26, 0x626e, 0x0202, 0x3292,
- 0x264a, 0x5626, 0x6e6e, 0x0e02, 0x3e92,
- 0x324a, 0x6226, 0x7a6e, 0x1a02, 0x4a92,
- 0x3e4a, 0x6e26, 0x026e, 0x2602, 0x5692,
- 0x4a4a, 0x7a26, 0x0e6e, 0x3202, 0x6292,
- 0x564a, 0x0226, 0x1a6e, 0x3e02, 0x6e92,
- 0x624a, 0x0e26, 0x266e, 0x4a02, 0x7a92,
- 0x6e4a, 0x1a26, 0x326e, 0x5602, 0x0292,
- 0x7a4a, 0x2626, 0x3e6e, 0x6202, 0x0e92,
- 0x024a, 0x3226, 0x4a6e, 0x6e02, 0x1a92,
- 0x0e4a, 0x3e26, 0x566e, 0x7a02, 0x2692,
- 0x1a4c, 0x4a28, 0x6270, 0x0204, 0x3294,
- 0x264c, 0x5628, 0x6e70, 0x0e04, 0x3e94,
- 0x324c, 0x6228, 0x7a70, 0x1a04, 0x4a94,
- 0x3e4c, 0x6e28, 0x0270, 0x2604, 0x5694,
- 0x4a4c, 0x7a28, 0x0e70, 0x3204, 0x6294,
- 0x564c, 0x0228, 0x1a70, 0x3e04, 0x6e94,
- 0x624c, 0x0e28, 0x2670, 0x4a04, 0x7a94,
- 0x6e4c, 0x1a28, 0x3270, 0x5604, 0x0294,
- 0x7a4c, 0x2628, 0x3e70, 0x6204, 0x0e94,
- 0x024c, 0x3228, 0x4a70, 0x6e04, 0x1a94,
- 0x0e4c, 0x3e28, 0x5670, 0x7a04, 0x2694,
- 0x1a4e, 0x4a2a, 0x6272, 0x0206, 0x3296,
- 0x264e, 0x562a, 0x6e72, 0x0e06, 0x3e96,
- 0x324e, 0x622a, 0x7a72, 0x1a06, 0x4a96,
- 0x3e4e, 0x6e2a, 0x0272, 0x2606, 0x5696,
- 0x4a4e, 0x7a2a, 0x0e72, 0x3206, 0x6296,
- 0x564e, 0x022a, 0x1a72, 0x3e06, 0x6e96,
- 0x624e, 0x0e2a, 0x2672, 0x4a06, 0x7a96,
- 0x6e4e, 0x1a2a, 0x3272, 0x5606, 0x0296,
- 0x7a4e, 0x262a, 0x3e72, 0x6206, 0x0e96,
- 0x024e, 0x322a, 0x4a72, 0x6e06, 0x1a96,
- 0x0e4e, 0x3e2a, 0x5672, 0x7a06, 0x2696,
- 0x1a50, 0x4a2c, 0x6274, 0x0208, 0x3298,
- 0x2650, 0x562c, 0x6e74, 0x0e08, 0x3e98,
- 0x3250, 0x622c, 0x7a74, 0x1a08, 0x4a98,
- 0x3e50, 0x6e2c, 0x0274, 0x2608, 0x5698,
- 0x4a50, 0x7a2c, 0x0e74, 0x3208, 0x6298,
- 0x5650, 0x022c, 0x1a74, 0x3e08, 0x6e98,
- 0x6250, 0x0e2c, 0x2674, 0x4a08, 0x7a98,
- 0x6e50, 0x1a2c, 0x3274, 0x5608, 0x0298,
- 0x7a50, 0x262c, 0x3e74, 0x6208, 0x0e98,
- 0x0250, 0x322c, 0x4a74, 0x6e08, 0x1a98,
- 0x0e50, 0x3e2c, 0x5674, 0x7a08, 0x2698,
- 0x1a52, 0x4a2e, 0x6276, 0x020a, 0x329a,
- 0x2652, 0x562e, 0x6e76, 0x0e0a, 0x3e9a,
- 0x3252, 0x622e, 0x7a76, 0x1a0a, 0x4a9a,
- 0x3e52, 0x6e2e, 0x0276, 0x260a, 0x569a,
- 0x4a52, 0x7a2e, 0x0e76, 0x320a, 0x629a,
- 0x5652, 0x022e, 0x1a76, 0x3e0a, 0x6e9a,
- 0x6252, 0x0e2e, 0x2676, 0x4a0a, 0x7a9a,
- 0x6e52, 0x1a2e, 0x3276, 0x560a, 0x029a,
- 0x7a52, 0x262e, 0x3e76, 0x620a, 0x0e9a,
- 0x0252, 0x322e, 0x4a76, 0x6e0a, 0x1a9a,
- 0x0e52, 0x3e2e, 0x5676, 0x7a0a, 0x269a,
- 0x1a54, 0x4a30, 0x6278, 0x020c, 0x329c,
- 0x2654, 0x5630, 0x6e78, 0x0e0c, 0x3e9c,
- 0x3254, 0x6230, 0x7a78, 0x1a0c, 0x4a9c,
- 0x3e54, 0x6e30, 0x0278, 0x260c, 0x569c,
- 0x4a54, 0x7a30, 0x0e78, 0x320c, 0x629c,
- 0x5654, 0x0230, 0x1a78, 0x3e0c, 0x6e9c,
- 0x6254, 0x0e30, 0x2678, 0x4a0c, 0x7a9c,
- 0x6e54, 0x1a30, 0x3278, 0x560c, 0x029c,
- 0x7a54, 0x2630, 0x3e78, 0x620c, 0x0e9c,
- 0x0254, 0x3230, 0x4a78, 0x6e0c, 0x1a9c,
- 0x0e54, 0x3e30, 0x5678, 0x7a0c, 0x269c,
- 0x1a56, 0x4a32, 0x627a, 0x020e, 0x329e,
- 0x2656, 0x5632, 0x6e7a, 0x0e0e, 0x3e9e,
- 0x3256, 0x6232, 0x7a7a, 0x1a0e, 0x4a9e,
- 0x3e56, 0x6e32, 0x027a, 0x260e, 0x569e,
- 0x4a56, 0x7a32, 0x0e7a, 0x320e, 0x629e,
- 0x5656, 0x0232, 0x1a7a, 0x3e0e, 0x6e9e,
- 0x6256, 0x0e32, 0x267a, 0x4a0e, 0x7a9e,
- 0x6e56, 0x1a32, 0x327a, 0x560e, 0x029e,
- 0x7a56, 0x2632, 0x3e7a, 0x620e, 0x0e9e,
- 0x0256, 0x3232, 0x4a7a, 0x6e0e, 0x1a9e,
- 0x0e56, 0x3e32, 0x567a, 0x7a0e, 0x269e,
- 0x1a58, 0x4a34, 0x627c, 0x0210, 0x32a0,
- 0x2658, 0x5634, 0x6e7c, 0x0e10, 0x3ea0,
- 0x3258, 0x6234, 0x7a7c, 0x1a10, 0x4aa0,
- 0x3e58, 0x6e34, 0x027c, 0x2610, 0x56a0,
- 0x4a58, 0x7a34, 0x0e7c, 0x3210, 0x62a0,
- 0x5658, 0x0234, 0x1a7c, 0x3e10, 0x6ea0,
- 0x6258, 0x0e34, 0x267c, 0x4a10, 0x7aa0,
- 0x6e58, 0x1a34, 0x327c, 0x5610, 0x02a0,
- 0x7a58, 0x2634, 0x3e7c, 0x6210, 0x0ea0,
- 0x0258, 0x3234, 0x4a7c, 0x6e10, 0x1aa0,
- 0x0e58, 0x3e34, 0x567c, 0x7a10, 0x26a0,
- 0x1e48, 0x4e24, 0x666c, 0x0600, 0x3690,
- 0x2a48, 0x5a24, 0x726c, 0x1200, 0x4290,
- 0x3648, 0x6624, 0x7e6c, 0x1e00, 0x4e90,
- 0x4248, 0x7224, 0x066c, 0x2a00, 0x5a90,
- 0x4e48, 0x7e24, 0x126c, 0x3600, 0x6690,
- 0x5a48, 0x0624, 0x1e6c, 0x4200, 0x7290,
- 0x6648, 0x1224, 0x2a6c, 0x4e00, 0x7e90,
- 0x7248, 0x1e24, 0x366c, 0x5a00, 0x0690,
- 0x7e48, 0x2a24, 0x426c, 0x6600, 0x1290,
- 0x0648, 0x3624, 0x4e6c, 0x7200, 0x1e90,
- 0x1248, 0x4224, 0x5a6c, 0x7e00, 0x2a90,
- 0x1e4a, 0x4e26, 0x666e, 0x0602, 0x3692,
- 0x2a4a, 0x5a26, 0x726e, 0x1202, 0x4292,
- 0x364a, 0x6626, 0x7e6e, 0x1e02, 0x4e92,
- 0x424a, 0x7226, 0x066e, 0x2a02, 0x5a92,
- 0x4e4a, 0x7e26, 0x126e, 0x3602, 0x6692,
- 0x5a4a, 0x0626, 0x1e6e, 0x4202, 0x7292,
- 0x664a, 0x1226, 0x2a6e, 0x4e02, 0x7e92,
- 0x724a, 0x1e26, 0x366e, 0x5a02, 0x0692,
- 0x7e4a, 0x2a26, 0x426e, 0x6602, 0x1292,
- 0x064a, 0x3626, 0x4e6e, 0x7202, 0x1e92,
- 0x124a, 0x4226, 0x5a6e, 0x7e02, 0x2a92,
- 0x1e4c, 0x4e28, 0x6670, 0x0604, 0x3694,
- 0x2a4c, 0x5a28, 0x7270, 0x1204, 0x4294,
- 0x364c, 0x6628, 0x7e70, 0x1e04, 0x4e94,
- 0x424c, 0x7228, 0x0670, 0x2a04, 0x5a94,
- 0x4e4c, 0x7e28, 0x1270, 0x3604, 0x6694,
- 0x5a4c, 0x0628, 0x1e70, 0x4204, 0x7294,
- 0x664c, 0x1228, 0x2a70, 0x4e04, 0x7e94,
- 0x724c, 0x1e28, 0x3670, 0x5a04, 0x0694,
- 0x7e4c, 0x2a28, 0x4270, 0x6604, 0x1294,
- 0x064c, 0x3628, 0x4e70, 0x7204, 0x1e94,
- 0x124c, 0x4228, 0x5a70, 0x7e04, 0x2a94,
- 0x1e4e, 0x4e2a, 0x6672, 0x0606, 0x3696,
- 0x2a4e, 0x5a2a, 0x7272, 0x1206, 0x4296,
- 0x364e, 0x662a, 0x7e72, 0x1e06, 0x4e96,
- 0x424e, 0x722a, 0x0672, 0x2a06, 0x5a96,
- 0x4e4e, 0x7e2a, 0x1272, 0x3606, 0x6696,
- 0x5a4e, 0x062a, 0x1e72, 0x4206, 0x7296,
- 0x664e, 0x122a, 0x2a72, 0x4e06, 0x7e96,
- 0x724e, 0x1e2a, 0x3672, 0x5a06, 0x0696,
- 0x7e4e, 0x2a2a, 0x4272, 0x6606, 0x1296,
- 0x064e, 0x362a, 0x4e72, 0x7206, 0x1e96,
- 0x124e, 0x422a, 0x5a72, 0x7e06, 0x2a96,
- 0x1e50, 0x4e2c, 0x6674, 0x0608, 0x3698,
- 0x2a50, 0x5a2c, 0x7274, 0x1208, 0x4298,
- 0x3650, 0x662c, 0x7e74, 0x1e08, 0x4e98,
- 0x4250, 0x722c, 0x0674, 0x2a08, 0x5a98,
- 0x4e50, 0x7e2c, 0x1274, 0x3608, 0x6698,
- 0x5a50, 0x062c, 0x1e74, 0x4208, 0x7298,
- 0x6650, 0x122c, 0x2a74, 0x4e08, 0x7e98,
- 0x7250, 0x1e2c, 0x3674, 0x5a08, 0x0698,
- 0x7e50, 0x2a2c, 0x4274, 0x6608, 0x1298,
- 0x0650, 0x362c, 0x4e74, 0x7208, 0x1e98,
- 0x1250, 0x422c, 0x5a74, 0x7e08, 0x2a98,
- 0x1e52, 0x4e2e, 0x6676, 0x060a, 0x369a,
- 0x2a52, 0x5a2e, 0x7276, 0x120a, 0x429a,
- 0x3652, 0x662e, 0x7e76, 0x1e0a, 0x4e9a,
- 0x4252, 0x722e, 0x0676, 0x2a0a, 0x5a9a,
- 0x4e52, 0x7e2e, 0x1276, 0x360a, 0x669a,
- 0x5a52, 0x062e, 0x1e76, 0x420a, 0x729a,
- 0x6652, 0x122e, 0x2a76, 0x4e0a, 0x7e9a,
- 0x7252, 0x1e2e, 0x3676, 0x5a0a, 0x069a,
- 0x7e52, 0x2a2e, 0x4276, 0x660a, 0x129a,
- 0x0652, 0x362e, 0x4e76, 0x720a, 0x1e9a,
- 0x1252, 0x422e, 0x5a76, 0x7e0a, 0x2a9a,
- 0x1e54, 0x4e30, 0x6678, 0x060c, 0x369c,
- 0x2a54, 0x5a30, 0x7278, 0x120c, 0x429c,
- 0x3654, 0x6630, 0x7e78, 0x1e0c, 0x4e9c,
- 0x4254, 0x7230, 0x0678, 0x2a0c, 0x5a9c,
- 0x4e54, 0x7e30, 0x1278, 0x360c, 0x669c,
- 0x5a54, 0x0630, 0x1e78, 0x420c, 0x729c,
- 0x6654, 0x1230, 0x2a78, 0x4e0c, 0x7e9c,
- 0x7254, 0x1e30, 0x3678, 0x5a0c, 0x069c,
- 0x7e54, 0x2a30, 0x4278, 0x660c, 0x129c,
- 0x0654, 0x3630, 0x4e78, 0x720c, 0x1e9c,
- 0x1254, 0x4230, 0x5a78, 0x7e0c, 0x2a9c,
- 0x1e56, 0x4e32, 0x667a, 0x060e, 0x369e,
- 0x2a56, 0x5a32, 0x727a, 0x120e, 0x429e,
- 0x3656, 0x6632, 0x7e7a, 0x1e0e, 0x4e9e,
- 0x4256, 0x7232, 0x067a, 0x2a0e, 0x5a9e,
- 0x4e56, 0x7e32, 0x127a, 0x360e, 0x669e,
- 0x5a56, 0x0632, 0x1e7a, 0x420e, 0x729e,
- 0x6656, 0x1232, 0x2a7a, 0x4e0e, 0x7e9e,
- 0x7256, 0x1e32, 0x367a, 0x5a0e, 0x069e,
- 0x7e56, 0x2a32, 0x427a, 0x660e, 0x129e,
- 0x0656, 0x3632, 0x4e7a, 0x720e, 0x1e9e,
- 0x1256, 0x4232, 0x5a7a, 0x7e0e, 0x2a9e,
- 0x1e58, 0x4e34, 0x667c, 0x0610, 0x36a0,
- 0x2a58, 0x5a34, 0x727c, 0x1210, 0x42a0,
- 0x3658, 0x6634, 0x7e7c, 0x1e10, 0x4ea0,
- 0x4258, 0x7234, 0x067c, 0x2a10, 0x5aa0,
- 0x4e58, 0x7e34, 0x127c, 0x3610, 0x66a0,
- 0x5a58, 0x0634, 0x1e7c, 0x4210, 0x72a0,
- 0x6658, 0x1234, 0x2a7c, 0x4e10, 0x7ea0,
- 0x7258, 0x1e34, 0x367c, 0x5a10, 0x06a0,
- 0x7e58, 0x2a34, 0x427c, 0x6610, 0x12a0,
- 0x0658, 0x3634, 0x4e7c, 0x7210, 0x1ea0,
- 0x1258, 0x4234, 0x5a7c, 0x7e10, 0x2aa0,
- 0x2248, 0x5224, 0x6a6c, 0x0a00, 0x3a90,
- 0x2e48, 0x5e24, 0x766c, 0x1600, 0x4690,
- 0x3a48, 0x6a24, 0x826c, 0x2200, 0x5290,
- 0x4648, 0x7624, 0x0a6c, 0x2e00, 0x5e90,
- 0x5248, 0x8224, 0x166c, 0x3a00, 0x6a90,
- 0x5e48, 0x0a24, 0x226c, 0x4600, 0x7690,
- 0x6a48, 0x1624, 0x2e6c, 0x5200, 0x8290,
- 0x7648, 0x2224, 0x3a6c, 0x5e00, 0x0a90,
- 0x8248, 0x2e24, 0x466c, 0x6a00, 0x1690,
- 0x0a48, 0x3a24, 0x526c, 0x7600, 0x2290,
- 0x1648, 0x4624, 0x5e6c, 0x8200, 0x2e90,
- 0x224a, 0x5226, 0x6a6e, 0x0a02, 0x3a92,
- 0x2e4a, 0x5e26, 0x766e, 0x1602, 0x4692,
- 0x3a4a, 0x6a26, 0x826e, 0x2202, 0x5292,
- 0x464a, 0x7626, 0x0a6e, 0x2e02, 0x5e92,
- 0x524a, 0x8226, 0x166e, 0x3a02, 0x6a92,
- 0x5e4a, 0x0a26, 0x226e, 0x4602, 0x7692,
- 0x6a4a, 0x1626, 0x2e6e, 0x5202, 0x8292,
- 0x764a, 0x2226, 0x3a6e, 0x5e02, 0x0a92,
- 0x824a, 0x2e26, 0x466e, 0x6a02, 0x1692,
- 0x0a4a, 0x3a26, 0x526e, 0x7602, 0x2292,
- 0x164a, 0x4626, 0x5e6e, 0x8202, 0x2e92,
- 0x224c, 0x5228, 0x6a70, 0x0a04, 0x3a94,
- 0x2e4c, 0x5e28, 0x7670, 0x1604, 0x4694,
- 0x3a4c, 0x6a28, 0x8270, 0x2204, 0x5294,
- 0x464c, 0x7628, 0x0a70, 0x2e04, 0x5e94,
- 0x524c, 0x8228, 0x1670, 0x3a04, 0x6a94,
- 0x5e4c, 0x0a28, 0x2270, 0x4604, 0x7694,
- 0x6a4c, 0x1628, 0x2e70, 0x5204, 0x8294,
- 0x764c, 0x2228, 0x3a70, 0x5e04, 0x0a94,
- 0x824c, 0x2e28, 0x4670, 0x6a04, 0x1694,
- 0x0a4c, 0x3a28, 0x5270, 0x7604, 0x2294,
- 0x164c, 0x4628, 0x5e70, 0x8204, 0x2e94,
- 0x224e, 0x522a, 0x6a72, 0x0a06, 0x3a96,
- 0x2e4e, 0x5e2a, 0x7672, 0x1606, 0x4696,
- 0x3a4e, 0x6a2a, 0x8272, 0x2206, 0x5296,
- 0x464e, 0x762a, 0x0a72, 0x2e06, 0x5e96,
- 0x524e, 0x822a, 0x1672, 0x3a06, 0x6a96,
- 0x5e4e, 0x0a2a, 0x2272, 0x4606, 0x7696,
- 0x6a4e, 0x162a, 0x2e72, 0x5206, 0x8296,
- 0x764e, 0x222a, 0x3a72, 0x5e06, 0x0a96,
- 0x824e, 0x2e2a, 0x4672, 0x6a06, 0x1696,
- 0x0a4e, 0x3a2a, 0x5272, 0x7606, 0x2296,
- 0x164e, 0x462a, 0x5e72, 0x8206, 0x2e96,
- 0x2250, 0x522c, 0x6a74, 0x0a08, 0x3a98,
- 0x2e50, 0x5e2c, 0x7674, 0x1608, 0x4698,
- 0x3a50, 0x6a2c, 0x8274, 0x2208, 0x5298,
- 0x4650, 0x762c, 0x0a74, 0x2e08, 0x5e98,
- 0x5250, 0x822c, 0x1674, 0x3a08, 0x6a98,
- 0x5e50, 0x0a2c, 0x2274, 0x4608, 0x7698,
- 0x6a50, 0x162c, 0x2e74, 0x5208, 0x8298,
- 0x7650, 0x222c, 0x3a74, 0x5e08, 0x0a98,
- 0x8250, 0x2e2c, 0x4674, 0x6a08, 0x1698,
- 0x0a50, 0x3a2c, 0x5274, 0x7608, 0x2298,
- 0x1650, 0x462c, 0x5e74, 0x8208, 0x2e98,
- 0x2252, 0x522e, 0x6a76, 0x0a0a, 0x3a9a,
- 0x2e52, 0x5e2e, 0x7676, 0x160a, 0x469a,
- 0x3a52, 0x6a2e, 0x8276, 0x220a, 0x529a,
- 0x4652, 0x762e, 0x0a76, 0x2e0a, 0x5e9a,
- 0x5252, 0x822e, 0x1676, 0x3a0a, 0x6a9a,
- 0x5e52, 0x0a2e, 0x2276, 0x460a, 0x769a,
- 0x6a52, 0x162e, 0x2e76, 0x520a, 0x829a,
- 0x7652, 0x222e, 0x3a76, 0x5e0a, 0x0a9a,
- 0x8252, 0x2e2e, 0x4676, 0x6a0a, 0x169a,
- 0x0a52, 0x3a2e, 0x5276, 0x760a, 0x229a,
- 0x1652, 0x462e, 0x5e76, 0x820a, 0x2e9a,
- 0x2254, 0x5230, 0x6a78, 0x0a0c, 0x3a9c,
- 0x2e54, 0x5e30, 0x7678, 0x160c, 0x469c,
- 0x3a54, 0x6a30, 0x8278, 0x220c, 0x529c,
- 0x4654, 0x7630, 0x0a78, 0x2e0c, 0x5e9c,
- 0x5254, 0x8230, 0x1678, 0x3a0c, 0x6a9c,
- 0x5e54, 0x0a30, 0x2278, 0x460c, 0x769c,
- 0x6a54, 0x1630, 0x2e78, 0x520c, 0x829c,
- 0x7654, 0x2230, 0x3a78, 0x5e0c, 0x0a9c,
- 0x8254, 0x2e30, 0x4678, 0x6a0c, 0x169c,
- 0x0a54, 0x3a30, 0x5278, 0x760c, 0x229c,
- 0x1654, 0x4630, 0x5e78, 0x820c, 0x2e9c,
- 0x2256, 0x5232, 0x6a7a, 0x0a0e, 0x3a9e,
- 0x2e56, 0x5e32, 0x767a, 0x160e, 0x469e,
- 0x3a56, 0x6a32, 0x827a, 0x220e, 0x529e,
- 0x4656, 0x7632, 0x0a7a, 0x2e0e, 0x5e9e,
- 0x5256, 0x8232, 0x167a, 0x3a0e, 0x6a9e,
- 0x5e56, 0x0a32, 0x227a, 0x460e, 0x769e,
- 0x6a56, 0x1632, 0x2e7a, 0x520e, 0x829e,
- 0x7656, 0x2232, 0x3a7a, 0x5e0e, 0x0a9e,
- 0x8256, 0x2e32, 0x467a, 0x6a0e, 0x169e,
- 0x0a56, 0x3a32, 0x527a, 0x760e, 0x229e,
- 0x1656, 0x4632, 0x5e7a, 0x820e, 0x2e9e,
- 0x2258, 0x5234, 0x6a7c, 0x0a10, 0x3aa0,
- 0x2e58, 0x5e34, 0x767c, 0x1610, 0x46a0,
- 0x3a58, 0x6a34, 0x827c, 0x2210, 0x52a0,
- 0x4658, 0x7634, 0x0a7c, 0x2e10, 0x5ea0,
- 0x5258, 0x8234, 0x167c, 0x3a10, 0x6aa0,
- 0x5e58, 0x0a34, 0x227c, 0x4610, 0x76a0,
- 0x6a58, 0x1634, 0x2e7c, 0x5210, 0x82a0,
- 0x7658, 0x2234, 0x3a7c, 0x5e10, 0x0aa0,
- 0x8258, 0x2e34, 0x467c, 0x6a10, 0x16a0,
- 0x0a58, 0x3a34, 0x527c, 0x7610, 0x22a0,
- 0x1658, 0x4634, 0x5e7c, 0x8210, 0x2ea0,
- 0x0000, 0x0036, 0x006c, 0x00a2, 0x8648,
- 0x0002, 0x0038, 0x006e, 0x00a4, 0x864c,
- 0x0004, 0x003a, 0x0070, 0x00a6, 0x8650,
- 0x0006, 0x003c, 0x0072, 0x00a8, 0x8654,
- 0x0008, 0x003e, 0x0074, 0x00aa, 0x8658,
- 0x000a, 0x0040, 0x0076, 0x00ac, 0x865c,
- 0x000c, 0x0042, 0x0078, 0x00ae, 0x8660,
- 0x000e, 0x0044, 0x007a, 0x00b0, 0x8664,
- 0x0010, 0x0046, 0x007c, 0x00b2, 0x8668,
- 0x0012, 0x0048, 0x007e, 0x8600, 0x866c,
- 0x0014, 0x004a, 0x0080, 0x8604, 0x8670,
- 0x0016, 0x004c, 0x0082, 0x8608, 0x8674,
- 0x0018, 0x004e, 0x0084, 0x860c, 0x8678,
- 0x001a, 0x0050, 0x0086, 0x8610, 0x867c,
- 0x001c, 0x0052, 0x0088, 0x8614, 0x8680,
- 0x001e, 0x0054, 0x008a, 0x8618, 0x8684,
- 0x0020, 0x0056, 0x008c, 0x861c, 0x8688,
- 0x0022, 0x0058, 0x008e, 0x8620, 0x868c,
- 0x0024, 0x005a, 0x0090, 0x8624, 0x8690,
- 0x0026, 0x005c, 0x0092, 0x8628, 0x8694,
- 0x0028, 0x005e, 0x0094, 0x862c, 0x8698,
- 0x002a, 0x0060, 0x0096, 0x8630, 0x869c,
- 0x002c, 0x0062, 0x0098, 0x8634, 0x86a0,
- 0x002e, 0x0064, 0x009a, 0x8638, 0x86a4,
- 0x0030, 0x0066, 0x009c, 0x863c, 0x86a8,
- 0x0032, 0x0068, 0x009e, 0x8640, 0x86ac,
- 0x0034, 0x006a, 0x00a0, 0x8644, 0x86b0,
- 0x4a5a, 0x7a36, 0x0e7e, 0x3212, 0x62a2,
- 0x565a, 0x0236, 0x1a7e, 0x3e12, 0x6ea2,
- 0x625a, 0x0e36, 0x267e, 0x4a12, 0x7aa2,
- 0x6e5a, 0x1a36, 0x327e, 0x5612, 0x02a2,
- 0x7a5a, 0x2636, 0x3e7e, 0x6212, 0x0ea2,
- 0x025a, 0x3236, 0x4a7e, 0x6e12, 0x1aa2,
- 0x0e5a, 0x3e36, 0x567e, 0x7a12, 0x26a2,
- 0x1a5a, 0x4a36, 0x627e, 0x0212, 0x32a2,
- 0x265a, 0x5636, 0x6e7e, 0x0e12, 0x3ea2,
- 0x325a, 0x6236, 0x7a7e, 0x1a12, 0x4aa2,
- 0x3e5a, 0x6e36, 0x027e, 0x2612, 0x56a2,
- 0x4a5c, 0x7a38, 0x0e80, 0x3214, 0x62a4,
- 0x565c, 0x0238, 0x1a80, 0x3e14, 0x6ea4,
- 0x625c, 0x0e38, 0x2680, 0x4a14, 0x7aa4,
- 0x6e5c, 0x1a38, 0x3280, 0x5614, 0x02a4,
- 0x7a5c, 0x2638, 0x3e80, 0x6214, 0x0ea4,
- 0x025c, 0x3238, 0x4a80, 0x6e14, 0x1aa4,
- 0x0e5c, 0x3e38, 0x5680, 0x7a14, 0x26a4,
- 0x1a5c, 0x4a38, 0x6280, 0x0214, 0x32a4,
- 0x265c, 0x5638, 0x6e80, 0x0e14, 0x3ea4,
- 0x325c, 0x6238, 0x7a80, 0x1a14, 0x4aa4,
- 0x3e5c, 0x6e38, 0x0280, 0x2614, 0x56a4,
- 0x4a5e, 0x7a3a, 0x0e82, 0x3216, 0x62a6,
- 0x565e, 0x023a, 0x1a82, 0x3e16, 0x6ea6,
- 0x625e, 0x0e3a, 0x2682, 0x4a16, 0x7aa6,
- 0x6e5e, 0x1a3a, 0x3282, 0x5616, 0x02a6,
- 0x7a5e, 0x263a, 0x3e82, 0x6216, 0x0ea6,
- 0x025e, 0x323a, 0x4a82, 0x6e16, 0x1aa6,
- 0x0e5e, 0x3e3a, 0x5682, 0x7a16, 0x26a6,
- 0x1a5e, 0x4a3a, 0x6282, 0x0216, 0x32a6,
- 0x265e, 0x563a, 0x6e82, 0x0e16, 0x3ea6,
- 0x325e, 0x623a, 0x7a82, 0x1a16, 0x4aa6,
- 0x3e5e, 0x6e3a, 0x0282, 0x2616, 0x56a6,
- 0x4a60, 0x7a3c, 0x0e84, 0x3218, 0x62a8,
- 0x5660, 0x023c, 0x1a84, 0x3e18, 0x6ea8,
- 0x6260, 0x0e3c, 0x2684, 0x4a18, 0x7aa8,
- 0x6e60, 0x1a3c, 0x3284, 0x5618, 0x02a8,
- 0x7a60, 0x263c, 0x3e84, 0x6218, 0x0ea8,
- 0x0260, 0x323c, 0x4a84, 0x6e18, 0x1aa8,
- 0x0e60, 0x3e3c, 0x5684, 0x7a18, 0x26a8,
- 0x1a60, 0x4a3c, 0x6284, 0x0218, 0x32a8,
- 0x2660, 0x563c, 0x6e84, 0x0e18, 0x3ea8,
- 0x3260, 0x623c, 0x7a84, 0x1a18, 0x4aa8,
- 0x3e60, 0x6e3c, 0x0284, 0x2618, 0x56a8,
- 0x4a62, 0x7a3e, 0x0e86, 0x321a, 0x62aa,
- 0x5662, 0x023e, 0x1a86, 0x3e1a, 0x6eaa,
- 0x6262, 0x0e3e, 0x2686, 0x4a1a, 0x7aaa,
- 0x6e62, 0x1a3e, 0x3286, 0x561a, 0x02aa,
- 0x7a62, 0x263e, 0x3e86, 0x621a, 0x0eaa,
- 0x0262, 0x323e, 0x4a86, 0x6e1a, 0x1aaa,
- 0x0e62, 0x3e3e, 0x5686, 0x7a1a, 0x26aa,
- 0x1a62, 0x4a3e, 0x6286, 0x021a, 0x32aa,
- 0x2662, 0x563e, 0x6e86, 0x0e1a, 0x3eaa,
- 0x3262, 0x623e, 0x7a86, 0x1a1a, 0x4aaa,
- 0x3e62, 0x6e3e, 0x0286, 0x261a, 0x56aa,
- 0x4a64, 0x7a40, 0x0e88, 0x321c, 0x62ac,
- 0x5664, 0x0240, 0x1a88, 0x3e1c, 0x6eac,
- 0x6264, 0x0e40, 0x2688, 0x4a1c, 0x7aac,
- 0x6e64, 0x1a40, 0x3288, 0x561c, 0x02ac,
- 0x7a64, 0x2640, 0x3e88, 0x621c, 0x0eac,
- 0x0264, 0x3240, 0x4a88, 0x6e1c, 0x1aac,
- 0x0e64, 0x3e40, 0x5688, 0x7a1c, 0x26ac,
- 0x1a64, 0x4a40, 0x6288, 0x021c, 0x32ac,
- 0x2664, 0x5640, 0x6e88, 0x0e1c, 0x3eac,
- 0x3264, 0x6240, 0x7a88, 0x1a1c, 0x4aac,
- 0x3e64, 0x6e40, 0x0288, 0x261c, 0x56ac,
- 0x4a66, 0x7a42, 0x0e8a, 0x321e, 0x62ae,
- 0x5666, 0x0242, 0x1a8a, 0x3e1e, 0x6eae,
- 0x6266, 0x0e42, 0x268a, 0x4a1e, 0x7aae,
- 0x6e66, 0x1a42, 0x328a, 0x561e, 0x02ae,
- 0x7a66, 0x2642, 0x3e8a, 0x621e, 0x0eae,
- 0x0266, 0x3242, 0x4a8a, 0x6e1e, 0x1aae,
- 0x0e66, 0x3e42, 0x568a, 0x7a1e, 0x26ae,
- 0x1a66, 0x4a42, 0x628a, 0x021e, 0x32ae,
- 0x2666, 0x5642, 0x6e8a, 0x0e1e, 0x3eae,
- 0x3266, 0x6242, 0x7a8a, 0x1a1e, 0x4aae,
- 0x3e66, 0x6e42, 0x028a, 0x261e, 0x56ae,
- 0x4a68, 0x7a44, 0x0e8c, 0x3220, 0x62b0,
- 0x5668, 0x0244, 0x1a8c, 0x3e20, 0x6eb0,
- 0x6268, 0x0e44, 0x268c, 0x4a20, 0x7ab0,
- 0x6e68, 0x1a44, 0x328c, 0x5620, 0x02b0,
- 0x7a68, 0x2644, 0x3e8c, 0x6220, 0x0eb0,
- 0x0268, 0x3244, 0x4a8c, 0x6e20, 0x1ab0,
- 0x0e68, 0x3e44, 0x568c, 0x7a20, 0x26b0,
- 0x1a68, 0x4a44, 0x628c, 0x0220, 0x32b0,
- 0x2668, 0x5644, 0x6e8c, 0x0e20, 0x3eb0,
- 0x3268, 0x6244, 0x7a8c, 0x1a20, 0x4ab0,
- 0x3e68, 0x6e44, 0x028c, 0x2620, 0x56b0,
- 0x4a6a, 0x7a46, 0x0e8e, 0x3222, 0x62b2,
- 0x566a, 0x0246, 0x1a8e, 0x3e22, 0x6eb2,
- 0x626a, 0x0e46, 0x268e, 0x4a22, 0x7ab2,
- 0x6e6a, 0x1a46, 0x328e, 0x5622, 0x02b2,
- 0x7a6a, 0x2646, 0x3e8e, 0x6222, 0x0eb2,
- 0x026a, 0x3246, 0x4a8e, 0x6e22, 0x1ab2,
- 0x0e6a, 0x3e46, 0x568e, 0x7a22, 0x26b2,
- 0x1a6a, 0x4a46, 0x628e, 0x0222, 0x32b2,
- 0x266a, 0x5646, 0x6e8e, 0x0e22, 0x3eb2,
- 0x326a, 0x6246, 0x7a8e, 0x1a22, 0x4ab2,
- 0x3e6a, 0x6e46, 0x028e, 0x2622, 0x56b2,
- 0x4e5a, 0x7e36, 0x127e, 0x3612, 0x66a2,
- 0x5a5a, 0x0636, 0x1e7e, 0x4212, 0x72a2,
- 0x665a, 0x1236, 0x2a7e, 0x4e12, 0x7ea2,
- 0x725a, 0x1e36, 0x367e, 0x5a12, 0x06a2,
- 0x7e5a, 0x2a36, 0x427e, 0x6612, 0x12a2,
- 0x065a, 0x3636, 0x4e7e, 0x7212, 0x1ea2,
- 0x125a, 0x4236, 0x5a7e, 0x7e12, 0x2aa2,
- 0x1e5a, 0x4e36, 0x667e, 0x0612, 0x36a2,
- 0x2a5a, 0x5a36, 0x727e, 0x1212, 0x42a2,
- 0x365a, 0x6636, 0x7e7e, 0x1e12, 0x4ea2,
- 0x425a, 0x7236, 0x067e, 0x2a12, 0x5aa2,
- 0x4e5c, 0x7e38, 0x1280, 0x3614, 0x66a4,
- 0x5a5c, 0x0638, 0x1e80, 0x4214, 0x72a4,
- 0x665c, 0x1238, 0x2a80, 0x4e14, 0x7ea4,
- 0x725c, 0x1e38, 0x3680, 0x5a14, 0x06a4,
- 0x7e5c, 0x2a38, 0x4280, 0x6614, 0x12a4,
- 0x065c, 0x3638, 0x4e80, 0x7214, 0x1ea4,
- 0x125c, 0x4238, 0x5a80, 0x7e14, 0x2aa4,
- 0x1e5c, 0x4e38, 0x6680, 0x0614, 0x36a4,
- 0x2a5c, 0x5a38, 0x7280, 0x1214, 0x42a4,
- 0x365c, 0x6638, 0x7e80, 0x1e14, 0x4ea4,
- 0x425c, 0x7238, 0x0680, 0x2a14, 0x5aa4,
- 0x4e5e, 0x7e3a, 0x1282, 0x3616, 0x66a6,
- 0x5a5e, 0x063a, 0x1e82, 0x4216, 0x72a6,
- 0x665e, 0x123a, 0x2a82, 0x4e16, 0x7ea6,
- 0x725e, 0x1e3a, 0x3682, 0x5a16, 0x06a6,
- 0x7e5e, 0x2a3a, 0x4282, 0x6616, 0x12a6,
- 0x065e, 0x363a, 0x4e82, 0x7216, 0x1ea6,
- 0x125e, 0x423a, 0x5a82, 0x7e16, 0x2aa6,
- 0x1e5e, 0x4e3a, 0x6682, 0x0616, 0x36a6,
- 0x2a5e, 0x5a3a, 0x7282, 0x1216, 0x42a6,
- 0x365e, 0x663a, 0x7e82, 0x1e16, 0x4ea6,
- 0x425e, 0x723a, 0x0682, 0x2a16, 0x5aa6,
- 0x4e60, 0x7e3c, 0x1284, 0x3618, 0x66a8,
- 0x5a60, 0x063c, 0x1e84, 0x4218, 0x72a8,
- 0x6660, 0x123c, 0x2a84, 0x4e18, 0x7ea8,
- 0x7260, 0x1e3c, 0x3684, 0x5a18, 0x06a8,
- 0x7e60, 0x2a3c, 0x4284, 0x6618, 0x12a8,
- 0x0660, 0x363c, 0x4e84, 0x7218, 0x1ea8,
- 0x1260, 0x423c, 0x5a84, 0x7e18, 0x2aa8,
- 0x1e60, 0x4e3c, 0x6684, 0x0618, 0x36a8,
- 0x2a60, 0x5a3c, 0x7284, 0x1218, 0x42a8,
- 0x3660, 0x663c, 0x7e84, 0x1e18, 0x4ea8,
- 0x4260, 0x723c, 0x0684, 0x2a18, 0x5aa8,
- 0x4e62, 0x7e3e, 0x1286, 0x361a, 0x66aa,
- 0x5a62, 0x063e, 0x1e86, 0x421a, 0x72aa,
- 0x6662, 0x123e, 0x2a86, 0x4e1a, 0x7eaa,
- 0x7262, 0x1e3e, 0x3686, 0x5a1a, 0x06aa,
- 0x7e62, 0x2a3e, 0x4286, 0x661a, 0x12aa,
- 0x0662, 0x363e, 0x4e86, 0x721a, 0x1eaa,
- 0x1262, 0x423e, 0x5a86, 0x7e1a, 0x2aaa,
- 0x1e62, 0x4e3e, 0x6686, 0x061a, 0x36aa,
- 0x2a62, 0x5a3e, 0x7286, 0x121a, 0x42aa,
- 0x3662, 0x663e, 0x7e86, 0x1e1a, 0x4eaa,
- 0x4262, 0x723e, 0x0686, 0x2a1a, 0x5aaa,
- 0x4e64, 0x7e40, 0x1288, 0x361c, 0x66ac,
- 0x5a64, 0x0640, 0x1e88, 0x421c, 0x72ac,
- 0x6664, 0x1240, 0x2a88, 0x4e1c, 0x7eac,
- 0x7264, 0x1e40, 0x3688, 0x5a1c, 0x06ac,
- 0x7e64, 0x2a40, 0x4288, 0x661c, 0x12ac,
- 0x0664, 0x3640, 0x4e88, 0x721c, 0x1eac,
- 0x1264, 0x4240, 0x5a88, 0x7e1c, 0x2aac,
- 0x1e64, 0x4e40, 0x6688, 0x061c, 0x36ac,
- 0x2a64, 0x5a40, 0x7288, 0x121c, 0x42ac,
- 0x3664, 0x6640, 0x7e88, 0x1e1c, 0x4eac,
- 0x4264, 0x7240, 0x0688, 0x2a1c, 0x5aac,
- 0x4e66, 0x7e42, 0x128a, 0x361e, 0x66ae,
- 0x5a66, 0x0642, 0x1e8a, 0x421e, 0x72ae,
- 0x6666, 0x1242, 0x2a8a, 0x4e1e, 0x7eae,
- 0x7266, 0x1e42, 0x368a, 0x5a1e, 0x06ae,
- 0x7e66, 0x2a42, 0x428a, 0x661e, 0x12ae,
- 0x0666, 0x3642, 0x4e8a, 0x721e, 0x1eae,
- 0x1266, 0x4242, 0x5a8a, 0x7e1e, 0x2aae,
- 0x1e66, 0x4e42, 0x668a, 0x061e, 0x36ae,
- 0x2a66, 0x5a42, 0x728a, 0x121e, 0x42ae,
- 0x3666, 0x6642, 0x7e8a, 0x1e1e, 0x4eae,
- 0x4266, 0x7242, 0x068a, 0x2a1e, 0x5aae,
- 0x4e68, 0x7e44, 0x128c, 0x3620, 0x66b0,
- 0x5a68, 0x0644, 0x1e8c, 0x4220, 0x72b0,
- 0x6668, 0x1244, 0x2a8c, 0x4e20, 0x7eb0,
- 0x7268, 0x1e44, 0x368c, 0x5a20, 0x06b0,
- 0x7e68, 0x2a44, 0x428c, 0x6620, 0x12b0,
- 0x0668, 0x3644, 0x4e8c, 0x7220, 0x1eb0,
- 0x1268, 0x4244, 0x5a8c, 0x7e20, 0x2ab0,
- 0x1e68, 0x4e44, 0x668c, 0x0620, 0x36b0,
- 0x2a68, 0x5a44, 0x728c, 0x1220, 0x42b0,
- 0x3668, 0x6644, 0x7e8c, 0x1e20, 0x4eb0,
- 0x4268, 0x7244, 0x068c, 0x2a20, 0x5ab0,
- 0x4e6a, 0x7e46, 0x128e, 0x3622, 0x66b2,
- 0x5a6a, 0x0646, 0x1e8e, 0x4222, 0x72b2,
- 0x666a, 0x1246, 0x2a8e, 0x4e22, 0x7eb2,
- 0x726a, 0x1e46, 0x368e, 0x5a22, 0x06b2,
- 0x7e6a, 0x2a46, 0x428e, 0x6622, 0x12b2,
- 0x066a, 0x3646, 0x4e8e, 0x7222, 0x1eb2,
- 0x126a, 0x4246, 0x5a8e, 0x7e22, 0x2ab2,
- 0x1e6a, 0x4e46, 0x668e, 0x0622, 0x36b2,
- 0x2a6a, 0x5a46, 0x728e, 0x1222, 0x42b2,
- 0x366a, 0x6646, 0x7e8e, 0x1e22, 0x4eb2,
- 0x426a, 0x7246, 0x068e, 0x2a22, 0x5ab2,
- 0x525a, 0x8236, 0x167e, 0x3a12, 0x6aa2,
- 0x5e5a, 0x0a36, 0x227e, 0x4612, 0x76a2,
- 0x6a5a, 0x1636, 0x2e7e, 0x5212, 0x82a2,
- 0x765a, 0x2236, 0x3a7e, 0x5e12, 0x0aa2,
- 0x825a, 0x2e36, 0x467e, 0x6a12, 0x16a2,
- 0x0a5a, 0x3a36, 0x527e, 0x7612, 0x22a2,
- 0x165a, 0x4636, 0x5e7e, 0x8212, 0x2ea2,
- 0x225a, 0x5236, 0x6a7e, 0x0a12, 0x3aa2,
- 0x2e5a, 0x5e36, 0x767e, 0x1612, 0x46a2,
- 0x3a5a, 0x6a36, 0x827e, 0x2212, 0x52a2,
- 0x465a, 0x7636, 0x0a7e, 0x2e12, 0x5ea2,
- 0x525c, 0x8238, 0x1680, 0x3a14, 0x6aa4,
- 0x5e5c, 0x0a38, 0x2280, 0x4614, 0x76a4,
- 0x6a5c, 0x1638, 0x2e80, 0x5214, 0x82a4,
- 0x765c, 0x2238, 0x3a80, 0x5e14, 0x0aa4,
- 0x825c, 0x2e38, 0x4680, 0x6a14, 0x16a4,
- 0x0a5c, 0x3a38, 0x5280, 0x7614, 0x22a4,
- 0x165c, 0x4638, 0x5e80, 0x8214, 0x2ea4,
- 0x225c, 0x5238, 0x6a80, 0x0a14, 0x3aa4,
- 0x2e5c, 0x5e38, 0x7680, 0x1614, 0x46a4,
- 0x3a5c, 0x6a38, 0x8280, 0x2214, 0x52a4,
- 0x465c, 0x7638, 0x0a80, 0x2e14, 0x5ea4,
- 0x525e, 0x823a, 0x1682, 0x3a16, 0x6aa6,
- 0x5e5e, 0x0a3a, 0x2282, 0x4616, 0x76a6,
- 0x6a5e, 0x163a, 0x2e82, 0x5216, 0x82a6,
- 0x765e, 0x223a, 0x3a82, 0x5e16, 0x0aa6,
- 0x825e, 0x2e3a, 0x4682, 0x6a16, 0x16a6,
- 0x0a5e, 0x3a3a, 0x5282, 0x7616, 0x22a6,
- 0x165e, 0x463a, 0x5e82, 0x8216, 0x2ea6,
- 0x225e, 0x523a, 0x6a82, 0x0a16, 0x3aa6,
- 0x2e5e, 0x5e3a, 0x7682, 0x1616, 0x46a6,
- 0x3a5e, 0x6a3a, 0x8282, 0x2216, 0x52a6,
- 0x465e, 0x763a, 0x0a82, 0x2e16, 0x5ea6,
- 0x5260, 0x823c, 0x1684, 0x3a18, 0x6aa8,
- 0x5e60, 0x0a3c, 0x2284, 0x4618, 0x76a8,
- 0x6a60, 0x163c, 0x2e84, 0x5218, 0x82a8,
- 0x7660, 0x223c, 0x3a84, 0x5e18, 0x0aa8,
- 0x8260, 0x2e3c, 0x4684, 0x6a18, 0x16a8,
- 0x0a60, 0x3a3c, 0x5284, 0x7618, 0x22a8,
- 0x1660, 0x463c, 0x5e84, 0x8218, 0x2ea8,
- 0x2260, 0x523c, 0x6a84, 0x0a18, 0x3aa8,
- 0x2e60, 0x5e3c, 0x7684, 0x1618, 0x46a8,
- 0x3a60, 0x6a3c, 0x8284, 0x2218, 0x52a8,
- 0x4660, 0x763c, 0x0a84, 0x2e18, 0x5ea8,
- 0x5262, 0x823e, 0x1686, 0x3a1a, 0x6aaa,
- 0x5e62, 0x0a3e, 0x2286, 0x461a, 0x76aa,
- 0x6a62, 0x163e, 0x2e86, 0x521a, 0x82aa,
- 0x7662, 0x223e, 0x3a86, 0x5e1a, 0x0aaa,
- 0x8262, 0x2e3e, 0x4686, 0x6a1a, 0x16aa,
- 0x0a62, 0x3a3e, 0x5286, 0x761a, 0x22aa,
- 0x1662, 0x463e, 0x5e86, 0x821a, 0x2eaa,
- 0x2262, 0x523e, 0x6a86, 0x0a1a, 0x3aaa,
- 0x2e62, 0x5e3e, 0x7686, 0x161a, 0x46aa,
- 0x3a62, 0x6a3e, 0x8286, 0x221a, 0x52aa,
- 0x4662, 0x763e, 0x0a86, 0x2e1a, 0x5eaa,
- 0x5264, 0x8240, 0x1688, 0x3a1c, 0x6aac,
- 0x5e64, 0x0a40, 0x2288, 0x461c, 0x76ac,
- 0x6a64, 0x1640, 0x2e88, 0x521c, 0x82ac,
- 0x7664, 0x2240, 0x3a88, 0x5e1c, 0x0aac,
- 0x8264, 0x2e40, 0x4688, 0x6a1c, 0x16ac,
- 0x0a64, 0x3a40, 0x5288, 0x761c, 0x22ac,
- 0x1664, 0x4640, 0x5e88, 0x821c, 0x2eac,
- 0x2264, 0x5240, 0x6a88, 0x0a1c, 0x3aac,
- 0x2e64, 0x5e40, 0x7688, 0x161c, 0x46ac,
- 0x3a64, 0x6a40, 0x8288, 0x221c, 0x52ac,
- 0x4664, 0x7640, 0x0a88, 0x2e1c, 0x5eac,
- 0x5266, 0x8242, 0x168a, 0x3a1e, 0x6aae,
- 0x5e66, 0x0a42, 0x228a, 0x461e, 0x76ae,
- 0x6a66, 0x1642, 0x2e8a, 0x521e, 0x82ae,
- 0x7666, 0x2242, 0x3a8a, 0x5e1e, 0x0aae,
- 0x8266, 0x2e42, 0x468a, 0x6a1e, 0x16ae,
- 0x0a66, 0x3a42, 0x528a, 0x761e, 0x22ae,
- 0x1666, 0x4642, 0x5e8a, 0x821e, 0x2eae,
- 0x2266, 0x5242, 0x6a8a, 0x0a1e, 0x3aae,
- 0x2e66, 0x5e42, 0x768a, 0x161e, 0x46ae,
- 0x3a66, 0x6a42, 0x828a, 0x221e, 0x52ae,
- 0x4666, 0x7642, 0x0a8a, 0x2e1e, 0x5eae,
- 0x5268, 0x8244, 0x168c, 0x3a20, 0x6ab0,
- 0x5e68, 0x0a44, 0x228c, 0x4620, 0x76b0,
- 0x6a68, 0x1644, 0x2e8c, 0x5220, 0x82b0,
- 0x7668, 0x2244, 0x3a8c, 0x5e20, 0x0ab0,
- 0x8268, 0x2e44, 0x468c, 0x6a20, 0x16b0,
- 0x0a68, 0x3a44, 0x528c, 0x7620, 0x22b0,
- 0x1668, 0x4644, 0x5e8c, 0x8220, 0x2eb0,
- 0x2268, 0x5244, 0x6a8c, 0x0a20, 0x3ab0,
- 0x2e68, 0x5e44, 0x768c, 0x1620, 0x46b0,
- 0x3a68, 0x6a44, 0x828c, 0x2220, 0x52b0,
- 0x4668, 0x7644, 0x0a8c, 0x2e20, 0x5eb0,
- 0x526a, 0x8246, 0x168e, 0x3a22, 0x6ab2,
- 0x5e6a, 0x0a46, 0x228e, 0x4622, 0x76b2,
- 0x6a6a, 0x1646, 0x2e8e, 0x5222, 0x82b2,
- 0x766a, 0x2246, 0x3a8e, 0x5e22, 0x0ab2,
- 0x826a, 0x2e46, 0x468e, 0x6a22, 0x16b2,
- 0x0a6a, 0x3a46, 0x528e, 0x7622, 0x22b2,
- 0x166a, 0x4646, 0x5e8e, 0x8222, 0x2eb2,
- 0x226a, 0x5246, 0x6a8e, 0x0a22, 0x3ab2,
- 0x2e6a, 0x5e46, 0x768e, 0x1622, 0x46b2,
- 0x3a6a, 0x6a46, 0x828e, 0x2222, 0x52b2,
- 0x466a, 0x7646, 0x0a8e, 0x2e22, 0x5eb2,
- 0x4a5a, 0x7a36, 0x0e7e, 0x3212, 0x62a2,
- 0x565a, 0x0236, 0x1a7e, 0x3e12, 0x6ea2,
- 0x625a, 0x0e36, 0x267e, 0x4a12, 0x7aa2,
- 0x6e5a, 0x1a36, 0x327e, 0x5612, 0x02a2,
- 0x7a5a, 0x2636, 0x3e7e, 0x6212, 0x0ea2,
- 0x025a, 0x3236, 0x4a7e, 0x6e12, 0x1aa2,
- 0x0e5a, 0x3e36, 0x567e, 0x7a12, 0x26a2,
- 0x1a5a, 0x4a36, 0x627e, 0x0212, 0x32a2,
- 0x265a, 0x5636, 0x6e7e, 0x0e12, 0x3ea2,
- 0x325a, 0x6236, 0x7a7e, 0x1a12, 0x4aa2,
- 0x3e5a, 0x6e36, 0x027e, 0x2612, 0x56a2,
- 0x4a5c, 0x7a38, 0x0e80, 0x3214, 0x62a4,
- 0x565c, 0x0238, 0x1a80, 0x3e14, 0x6ea4,
- 0x625c, 0x0e38, 0x2680, 0x4a14, 0x7aa4,
- 0x6e5c, 0x1a38, 0x3280, 0x5614, 0x02a4,
- 0x7a5c, 0x2638, 0x3e80, 0x6214, 0x0ea4,
- 0x025c, 0x3238, 0x4a80, 0x6e14, 0x1aa4,
- 0x0e5c, 0x3e38, 0x5680, 0x7a14, 0x26a4,
- 0x1a5c, 0x4a38, 0x6280, 0x0214, 0x32a4,
- 0x265c, 0x5638, 0x6e80, 0x0e14, 0x3ea4,
- 0x325c, 0x6238, 0x7a80, 0x1a14, 0x4aa4,
- 0x3e5c, 0x6e38, 0x0280, 0x2614, 0x56a4,
- 0x4a5e, 0x7a3a, 0x0e82, 0x3216, 0x62a6,
- 0x565e, 0x023a, 0x1a82, 0x3e16, 0x6ea6,
- 0x625e, 0x0e3a, 0x2682, 0x4a16, 0x7aa6,
- 0x6e5e, 0x1a3a, 0x3282, 0x5616, 0x02a6,
- 0x7a5e, 0x263a, 0x3e82, 0x6216, 0x0ea6,
- 0x7c48, 0x2824, 0x406c, 0x6400, 0x1090,
- 0x0448, 0x3424, 0x4c6c, 0x7000, 0x1c90,
- 0x1048, 0x4024, 0x586c, 0x7c00, 0x2890,
- 0x1c48, 0x4c24, 0x646c, 0x0400, 0x3490,
- 0x2848, 0x5824, 0x706c, 0x1000, 0x4090,
- 0x3448, 0x6424, 0x7c6c, 0x1c00, 0x4c90,
- 0x4048, 0x7024, 0x046c, 0x2800, 0x5890,
- 0x4c48, 0x7c24, 0x106c, 0x3400, 0x6490,
- 0x5848, 0x0424, 0x1c6c, 0x4000, 0x7090,
- 0x6448, 0x1024, 0x286c, 0x4c00, 0x7c90,
- 0x7048, 0x1c24, 0x346c, 0x5800, 0x0490,
- 0x7c4a, 0x2826, 0x406e, 0x6402, 0x1092,
- 0x044a, 0x3426, 0x4c6e, 0x7002, 0x1c92,
- 0x104a, 0x4026, 0x586e, 0x7c02, 0x2892,
- 0x1c4a, 0x4c26, 0x646e, 0x0402, 0x3492,
- 0x284a, 0x5826, 0x706e, 0x1002, 0x4092,
- 0x344a, 0x6426, 0x7c6e, 0x1c02, 0x4c92,
- 0x404a, 0x7026, 0x046e, 0x2802, 0x5892,
- 0x4c4a, 0x7c26, 0x106e, 0x3402, 0x6492,
- 0x584a, 0x0426, 0x1c6e, 0x4002, 0x7092,
- 0x644a, 0x1026, 0x286e, 0x4c02, 0x7c92,
- 0x704a, 0x1c26, 0x346e, 0x5802, 0x0492,
- 0x7c4c, 0x2828, 0x4070, 0x6404, 0x1094,
- 0x044c, 0x3428, 0x4c70, 0x7004, 0x1c94,
- 0x104c, 0x4028, 0x5870, 0x7c04, 0x2894,
- 0x1c4c, 0x4c28, 0x6470, 0x0404, 0x3494,
- 0x284c, 0x5828, 0x7070, 0x1004, 0x4094,
- 0x344c, 0x6428, 0x7c70, 0x1c04, 0x4c94,
- 0x404c, 0x7028, 0x0470, 0x2804, 0x5894,
- 0x4c4c, 0x7c28, 0x1070, 0x3404, 0x6494,
- 0x584c, 0x0428, 0x1c70, 0x4004, 0x7094,
- 0x644c, 0x1028, 0x2870, 0x4c04, 0x7c94,
- 0x704c, 0x1c28, 0x3470, 0x5804, 0x0494,
- 0x7c4e, 0x282a, 0x4072, 0x6406, 0x1096,
- 0x044e, 0x342a, 0x4c72, 0x7006, 0x1c96,
- 0x104e, 0x402a, 0x5872, 0x7c06, 0x2896,
- 0x1c4e, 0x4c2a, 0x6472, 0x0406, 0x3496,
- 0x284e, 0x582a, 0x7072, 0x1006, 0x4096,
- 0x344e, 0x642a, 0x7c72, 0x1c06, 0x4c96,
- 0x404e, 0x702a, 0x0472, 0x2806, 0x5896,
- 0x4c4e, 0x7c2a, 0x1072, 0x3406, 0x6496,
- 0x584e, 0x042a, 0x1c72, 0x4006, 0x7096,
- 0x644e, 0x102a, 0x2872, 0x4c06, 0x7c96,
- 0x704e, 0x1c2a, 0x3472, 0x5806, 0x0496,
- 0x7c50, 0x282c, 0x4074, 0x6408, 0x1098,
- 0x0450, 0x342c, 0x4c74, 0x7008, 0x1c98,
- 0x1050, 0x402c, 0x5874, 0x7c08, 0x2898,
- 0x1c50, 0x4c2c, 0x6474, 0x0408, 0x3498,
- 0x2850, 0x582c, 0x7074, 0x1008, 0x4098,
- 0x3450, 0x642c, 0x7c74, 0x1c08, 0x4c98,
- 0x4050, 0x702c, 0x0474, 0x2808, 0x5898,
- 0x4c50, 0x7c2c, 0x1074, 0x3408, 0x6498,
- 0x5850, 0x042c, 0x1c74, 0x4008, 0x7098,
- 0x6450, 0x102c, 0x2874, 0x4c08, 0x7c98,
- 0x7050, 0x1c2c, 0x3474, 0x5808, 0x0498,
- 0x7c52, 0x282e, 0x4076, 0x640a, 0x109a,
- 0x0452, 0x342e, 0x4c76, 0x700a, 0x1c9a,
- 0x1052, 0x402e, 0x5876, 0x7c0a, 0x289a,
- 0x1c52, 0x4c2e, 0x6476, 0x040a, 0x349a,
- 0x2852, 0x582e, 0x7076, 0x100a, 0x409a,
- 0x3452, 0x642e, 0x7c76, 0x1c0a, 0x4c9a,
- 0x4052, 0x702e, 0x0476, 0x280a, 0x589a,
- 0x4c52, 0x7c2e, 0x1076, 0x340a, 0x649a,
- 0x5852, 0x042e, 0x1c76, 0x400a, 0x709a,
- 0x6452, 0x102e, 0x2876, 0x4c0a, 0x7c9a,
- 0x7052, 0x1c2e, 0x3476, 0x580a, 0x049a,
- 0x7c54, 0x2830, 0x4078, 0x640c, 0x109c,
- 0x0454, 0x3430, 0x4c78, 0x700c, 0x1c9c,
- 0x1054, 0x4030, 0x5878, 0x7c0c, 0x289c,
- 0x1c54, 0x4c30, 0x6478, 0x040c, 0x349c,
- 0x2854, 0x5830, 0x7078, 0x100c, 0x409c,
- 0x3454, 0x6430, 0x7c78, 0x1c0c, 0x4c9c,
- 0x4054, 0x7030, 0x0478, 0x280c, 0x589c,
- 0x4c54, 0x7c30, 0x1078, 0x340c, 0x649c,
- 0x5854, 0x0430, 0x1c78, 0x400c, 0x709c,
- 0x6454, 0x1030, 0x2878, 0x4c0c, 0x7c9c,
- 0x7054, 0x1c30, 0x3478, 0x580c, 0x049c,
- 0x7c56, 0x2832, 0x407a, 0x640e, 0x109e,
- 0x0456, 0x3432, 0x4c7a, 0x700e, 0x1c9e,
- 0x1056, 0x4032, 0x587a, 0x7c0e, 0x289e,
- 0x1c56, 0x4c32, 0x647a, 0x040e, 0x349e,
- 0x2856, 0x5832, 0x707a, 0x100e, 0x409e,
- 0x3456, 0x6432, 0x7c7a, 0x1c0e, 0x4c9e,
- 0x4056, 0x7032, 0x047a, 0x280e, 0x589e,
- 0x4c56, 0x7c32, 0x107a, 0x340e, 0x649e,
- 0x5856, 0x0432, 0x1c7a, 0x400e, 0x709e,
- 0x6456, 0x1032, 0x287a, 0x4c0e, 0x7c9e,
- 0x7056, 0x1c32, 0x347a, 0x580e, 0x049e,
- 0x7c58, 0x2834, 0x407c, 0x6410, 0x10a0,
- 0x0458, 0x3434, 0x4c7c, 0x7010, 0x1ca0,
- 0x1058, 0x4034, 0x587c, 0x7c10, 0x28a0,
- 0x1c58, 0x4c34, 0x647c, 0x0410, 0x34a0,
- 0x2858, 0x5834, 0x707c, 0x1010, 0x40a0,
- 0x3458, 0x6434, 0x7c7c, 0x1c10, 0x4ca0,
- 0x4058, 0x7034, 0x047c, 0x2810, 0x58a0,
- 0x4c58, 0x7c34, 0x107c, 0x3410, 0x64a0,
- 0x5858, 0x0434, 0x1c7c, 0x4010, 0x70a0,
- 0x6458, 0x1034, 0x287c, 0x4c10, 0x7ca0,
- 0x7058, 0x1c34, 0x347c, 0x5810, 0x04a0,
- 0x8048, 0x2c24, 0x446c, 0x6800, 0x1490,
- 0x0848, 0x3824, 0x506c, 0x7400, 0x2090,
- 0x1448, 0x4424, 0x5c6c, 0x8000, 0x2c90,
- 0x2048, 0x5024, 0x686c, 0x0800, 0x3890,
- 0x2c48, 0x5c24, 0x746c, 0x1400, 0x4490,
- 0x3848, 0x6824, 0x806c, 0x2000, 0x5090,
- 0x4448, 0x7424, 0x086c, 0x2c00, 0x5c90,
- 0x5048, 0x8024, 0x146c, 0x3800, 0x6890,
- 0x5c48, 0x0824, 0x206c, 0x4400, 0x7490,
- 0x6848, 0x1424, 0x2c6c, 0x5000, 0x8090,
- 0x7448, 0x2024, 0x386c, 0x5c00, 0x0890,
- 0x804a, 0x2c26, 0x446e, 0x6802, 0x1492,
- 0x084a, 0x3826, 0x506e, 0x7402, 0x2092,
- 0x144a, 0x4426, 0x5c6e, 0x8002, 0x2c92,
- 0x204a, 0x5026, 0x686e, 0x0802, 0x3892,
- 0x2c4a, 0x5c26, 0x746e, 0x1402, 0x4492,
- 0x384a, 0x6826, 0x806e, 0x2002, 0x5092,
- 0x444a, 0x7426, 0x086e, 0x2c02, 0x5c92,
- 0x504a, 0x8026, 0x146e, 0x3802, 0x6892,
- 0x5c4a, 0x0826, 0x206e, 0x4402, 0x7492,
- 0x684a, 0x1426, 0x2c6e, 0x5002, 0x8092,
- 0x744a, 0x2026, 0x386e, 0x5c02, 0x0892,
- 0x804c, 0x2c28, 0x4470, 0x6804, 0x1494,
- 0x084c, 0x3828, 0x5070, 0x7404, 0x2094,
- 0x144c, 0x4428, 0x5c70, 0x8004, 0x2c94,
- 0x204c, 0x5028, 0x6870, 0x0804, 0x3894,
- 0x2c4c, 0x5c28, 0x7470, 0x1404, 0x4494,
- 0x384c, 0x6828, 0x8070, 0x2004, 0x5094,
- 0x444c, 0x7428, 0x0870, 0x2c04, 0x5c94,
- 0x504c, 0x8028, 0x1470, 0x3804, 0x6894,
- 0x5c4c, 0x0828, 0x2070, 0x4404, 0x7494,
- 0x684c, 0x1428, 0x2c70, 0x5004, 0x8094,
- 0x744c, 0x2028, 0x3870, 0x5c04, 0x0894,
- 0x804e, 0x2c2a, 0x4472, 0x6806, 0x1496,
- 0x084e, 0x382a, 0x5072, 0x7406, 0x2096,
- 0x144e, 0x442a, 0x5c72, 0x8006, 0x2c96,
- 0x204e, 0x502a, 0x6872, 0x0806, 0x3896,
- 0x2c4e, 0x5c2a, 0x7472, 0x1406, 0x4496,
- 0x384e, 0x682a, 0x8072, 0x2006, 0x5096,
- 0x444e, 0x742a, 0x0872, 0x2c06, 0x5c96,
- 0x504e, 0x802a, 0x1472, 0x3806, 0x6896,
- 0x5c4e, 0x082a, 0x2072, 0x4406, 0x7496,
- 0x684e, 0x142a, 0x2c72, 0x5006, 0x8096,
- 0x744e, 0x202a, 0x3872, 0x5c06, 0x0896,
- 0x8050, 0x2c2c, 0x4474, 0x6808, 0x1498,
- 0x0850, 0x382c, 0x5074, 0x7408, 0x2098,
- 0x1450, 0x442c, 0x5c74, 0x8008, 0x2c98,
- 0x2050, 0x502c, 0x6874, 0x0808, 0x3898,
- 0x2c50, 0x5c2c, 0x7474, 0x1408, 0x4498,
- 0x3850, 0x682c, 0x8074, 0x2008, 0x5098,
- 0x4450, 0x742c, 0x0874, 0x2c08, 0x5c98,
- 0x5050, 0x802c, 0x1474, 0x3808, 0x6898,
- 0x5c50, 0x082c, 0x2074, 0x4408, 0x7498,
- 0x6850, 0x142c, 0x2c74, 0x5008, 0x8098,
- 0x7450, 0x202c, 0x3874, 0x5c08, 0x0898,
- 0x8052, 0x2c2e, 0x4476, 0x680a, 0x149a,
- 0x0852, 0x382e, 0x5076, 0x740a, 0x209a,
- 0x1452, 0x442e, 0x5c76, 0x800a, 0x2c9a,
- 0x2052, 0x502e, 0x6876, 0x080a, 0x389a,
- 0x2c52, 0x5c2e, 0x7476, 0x140a, 0x449a,
- 0x3852, 0x682e, 0x8076, 0x200a, 0x509a,
- 0x4452, 0x742e, 0x0876, 0x2c0a, 0x5c9a,
- 0x5052, 0x802e, 0x1476, 0x380a, 0x689a,
- 0x5c52, 0x082e, 0x2076, 0x440a, 0x749a,
- 0x6852, 0x142e, 0x2c76, 0x500a, 0x809a,
- 0x7452, 0x202e, 0x3876, 0x5c0a, 0x089a,
- 0x8054, 0x2c30, 0x4478, 0x680c, 0x149c,
- 0x0854, 0x3830, 0x5078, 0x740c, 0x209c,
- 0x1454, 0x4430, 0x5c78, 0x800c, 0x2c9c,
- 0x2054, 0x5030, 0x6878, 0x080c, 0x389c,
- 0x2c54, 0x5c30, 0x7478, 0x140c, 0x449c,
- 0x3854, 0x6830, 0x8078, 0x200c, 0x509c,
- 0x4454, 0x7430, 0x0878, 0x2c0c, 0x5c9c,
- 0x5054, 0x8030, 0x1478, 0x380c, 0x689c,
- 0x5c54, 0x0830, 0x2078, 0x440c, 0x749c,
- 0x6854, 0x1430, 0x2c78, 0x500c, 0x809c,
- 0x7454, 0x2030, 0x3878, 0x5c0c, 0x089c,
- 0x8056, 0x2c32, 0x447a, 0x680e, 0x149e,
- 0x0856, 0x3832, 0x507a, 0x740e, 0x209e,
- 0x1456, 0x4432, 0x5c7a, 0x800e, 0x2c9e,
- 0x2056, 0x5032, 0x687a, 0x080e, 0x389e,
- 0x2c56, 0x5c32, 0x747a, 0x140e, 0x449e,
- 0x3856, 0x6832, 0x807a, 0x200e, 0x509e,
- 0x4456, 0x7432, 0x087a, 0x2c0e, 0x5c9e,
- 0x5056, 0x8032, 0x147a, 0x380e, 0x689e,
- 0x5c56, 0x0832, 0x207a, 0x440e, 0x749e,
- 0x6856, 0x1432, 0x2c7a, 0x500e, 0x809e,
- 0x7456, 0x2032, 0x387a, 0x5c0e, 0x089e,
- 0x8058, 0x2c34, 0x447c, 0x6810, 0x14a0,
- 0x0858, 0x3834, 0x507c, 0x7410, 0x20a0,
- 0x1458, 0x4434, 0x5c7c, 0x8010, 0x2ca0,
- 0x2058, 0x5034, 0x687c, 0x0810, 0x38a0,
- 0x2c58, 0x5c34, 0x747c, 0x1410, 0x44a0,
- 0x3858, 0x6834, 0x807c, 0x2010, 0x50a0,
- 0x4458, 0x7434, 0x087c, 0x2c10, 0x5ca0,
- 0x5058, 0x8034, 0x147c, 0x3810, 0x68a0,
- 0x5c58, 0x0834, 0x207c, 0x4410, 0x74a0,
- 0x6858, 0x1434, 0x2c7c, 0x5010, 0x80a0,
- 0x7458, 0x2034, 0x387c, 0x5c10, 0x08a0,
- 0x8448, 0x3024, 0x486c, 0x6c00, 0x1890,
- 0x0c48, 0x3c24, 0x546c, 0x7800, 0x2490,
- 0x1848, 0x4824, 0x606c, 0x8400, 0x3090,
- 0x2448, 0x5424, 0x6c6c, 0x0c00, 0x3c90,
- 0x3048, 0x6024, 0x786c, 0x1800, 0x4890,
- 0x3c48, 0x6c24, 0x846c, 0x2400, 0x5490,
- 0x4848, 0x7824, 0x0c6c, 0x3000, 0x6090,
- 0x5448, 0x8424, 0x186c, 0x3c00, 0x6c90,
- 0x6048, 0x0c24, 0x246c, 0x4800, 0x7890,
- 0x6c48, 0x1824, 0x306c, 0x5400, 0x8490,
- 0x7848, 0x2424, 0x3c6c, 0x6000, 0x0c90,
- 0x844a, 0x3026, 0x486e, 0x6c02, 0x1892,
- 0x0c4a, 0x3c26, 0x546e, 0x7802, 0x2492,
- 0x184a, 0x4826, 0x606e, 0x8402, 0x3092,
- 0x244a, 0x5426, 0x6c6e, 0x0c02, 0x3c92,
- 0x304a, 0x6026, 0x786e, 0x1802, 0x4892,
- 0x3c4a, 0x6c26, 0x846e, 0x2402, 0x5492,
- 0x484a, 0x7826, 0x0c6e, 0x3002, 0x6092,
- 0x544a, 0x8426, 0x186e, 0x3c02, 0x6c92,
- 0x604a, 0x0c26, 0x246e, 0x4802, 0x7892,
- 0x6c4a, 0x1826, 0x306e, 0x5402, 0x8492,
- 0x784a, 0x2426, 0x3c6e, 0x6002, 0x0c92,
- 0x844c, 0x3028, 0x4870, 0x6c04, 0x1894,
- 0x0c4c, 0x3c28, 0x5470, 0x7804, 0x2494,
- 0x184c, 0x4828, 0x6070, 0x8404, 0x3094,
- 0x244c, 0x5428, 0x6c70, 0x0c04, 0x3c94,
- 0x304c, 0x6028, 0x7870, 0x1804, 0x4894,
- 0x3c4c, 0x6c28, 0x8470, 0x2404, 0x5494,
- 0x484c, 0x7828, 0x0c70, 0x3004, 0x6094,
- 0x544c, 0x8428, 0x1870, 0x3c04, 0x6c94,
- 0x604c, 0x0c28, 0x2470, 0x4804, 0x7894,
- 0x6c4c, 0x1828, 0x3070, 0x5404, 0x8494,
- 0x784c, 0x2428, 0x3c70, 0x6004, 0x0c94,
- 0x844e, 0x302a, 0x4872, 0x6c06, 0x1896,
- 0x0c4e, 0x3c2a, 0x5472, 0x7806, 0x2496,
- 0x184e, 0x482a, 0x6072, 0x8406, 0x3096,
- 0x244e, 0x542a, 0x6c72, 0x0c06, 0x3c96,
- 0x304e, 0x602a, 0x7872, 0x1806, 0x4896,
- 0x3c4e, 0x6c2a, 0x8472, 0x2406, 0x5496,
- 0x484e, 0x782a, 0x0c72, 0x3006, 0x6096,
- 0x544e, 0x842a, 0x1872, 0x3c06, 0x6c96,
- 0x604e, 0x0c2a, 0x2472, 0x4806, 0x7896,
- 0x6c4e, 0x182a, 0x3072, 0x5406, 0x8496,
- 0x784e, 0x242a, 0x3c72, 0x6006, 0x0c96,
- 0x8450, 0x302c, 0x4874, 0x6c08, 0x1898,
- 0x0c50, 0x3c2c, 0x5474, 0x7808, 0x2498,
- 0x1850, 0x482c, 0x6074, 0x8408, 0x3098,
- 0x2450, 0x542c, 0x6c74, 0x0c08, 0x3c98,
- 0x3050, 0x602c, 0x7874, 0x1808, 0x4898,
- 0x3c50, 0x6c2c, 0x8474, 0x2408, 0x5498,
- 0x4850, 0x782c, 0x0c74, 0x3008, 0x6098,
- 0x5450, 0x842c, 0x1874, 0x3c08, 0x6c98,
- 0x6050, 0x0c2c, 0x2474, 0x4808, 0x7898,
- 0x6c50, 0x182c, 0x3074, 0x5408, 0x8498,
- 0x7850, 0x242c, 0x3c74, 0x6008, 0x0c98,
- 0x8452, 0x302e, 0x4876, 0x6c0a, 0x189a,
- 0x0c52, 0x3c2e, 0x5476, 0x780a, 0x249a,
- 0x1852, 0x482e, 0x6076, 0x840a, 0x309a,
- 0x2452, 0x542e, 0x6c76, 0x0c0a, 0x3c9a,
- 0x3052, 0x602e, 0x7876, 0x180a, 0x489a,
- 0x3c52, 0x6c2e, 0x8476, 0x240a, 0x549a,
- 0x4852, 0x782e, 0x0c76, 0x300a, 0x609a,
- 0x5452, 0x842e, 0x1876, 0x3c0a, 0x6c9a,
- 0x6052, 0x0c2e, 0x2476, 0x480a, 0x789a,
- 0x6c52, 0x182e, 0x3076, 0x540a, 0x849a,
- 0x7852, 0x242e, 0x3c76, 0x600a, 0x0c9a,
- 0x8454, 0x3030, 0x4878, 0x6c0c, 0x189c,
- 0x0c54, 0x3c30, 0x5478, 0x780c, 0x249c,
- 0x1854, 0x4830, 0x6078, 0x840c, 0x309c,
- 0x2454, 0x5430, 0x6c78, 0x0c0c, 0x3c9c,
- 0x3054, 0x6030, 0x7878, 0x180c, 0x489c,
- 0x3c54, 0x6c30, 0x8478, 0x240c, 0x549c,
- 0x4854, 0x7830, 0x0c78, 0x300c, 0x609c,
- 0x5454, 0x8430, 0x1878, 0x3c0c, 0x6c9c,
- 0x6054, 0x0c30, 0x2478, 0x480c, 0x789c,
- 0x6c54, 0x1830, 0x3078, 0x540c, 0x849c,
- 0x7854, 0x2430, 0x3c78, 0x600c, 0x0c9c,
- 0x8456, 0x3032, 0x487a, 0x6c0e, 0x189e,
- 0x0c56, 0x3c32, 0x547a, 0x780e, 0x249e,
- 0x1856, 0x4832, 0x607a, 0x840e, 0x309e,
- 0x2456, 0x5432, 0x6c7a, 0x0c0e, 0x3c9e,
- 0x3056, 0x6032, 0x787a, 0x180e, 0x489e,
- 0x3c56, 0x6c32, 0x847a, 0x240e, 0x549e,
- 0x4856, 0x7832, 0x0c7a, 0x300e, 0x609e,
- 0x5456, 0x8432, 0x187a, 0x3c0e, 0x6c9e,
- 0x6056, 0x0c32, 0x247a, 0x480e, 0x789e,
- 0x6c56, 0x1832, 0x307a, 0x540e, 0x849e,
- 0x7856, 0x2432, 0x3c7a, 0x600e, 0x0c9e,
- 0x8458, 0x3034, 0x487c, 0x6c10, 0x18a0,
- 0x0c58, 0x3c34, 0x547c, 0x7810, 0x24a0,
- 0x1858, 0x4834, 0x607c, 0x8410, 0x30a0,
- 0x2458, 0x5434, 0x6c7c, 0x0c10, 0x3ca0,
- 0x3058, 0x6034, 0x787c, 0x1810, 0x48a0,
- 0x3c58, 0x6c34, 0x847c, 0x2410, 0x54a0,
- 0x4858, 0x7834, 0x0c7c, 0x3010, 0x60a0,
- 0x5458, 0x8434, 0x187c, 0x3c10, 0x6ca0,
- 0x6058, 0x0c34, 0x247c, 0x4810, 0x78a0,
- 0x6c58, 0x1834, 0x307c, 0x5410, 0x84a0,
- 0x7858, 0x2434, 0x3c7c, 0x6010, 0x0ca0,
- 0x7c48, 0x2824, 0x406c, 0x6400, 0x1090,
- 0x0448, 0x3424, 0x4c6c, 0x7000, 0x1c90,
- 0x1048, 0x4024, 0x586c, 0x7c00, 0x2890,
- 0x1c48, 0x4c24, 0x646c, 0x0400, 0x3490,
- 0x2848, 0x5824, 0x706c, 0x1000, 0x4090,
- 0x3448, 0x6424, 0x7c6c, 0x1c00, 0x4c90,
- 0x4048, 0x7024, 0x046c, 0x2800, 0x5890,
- 0x4c48, 0x7c24, 0x106c, 0x3400, 0x6490,
- 0x5848, 0x0424, 0x1c6c, 0x4000, 0x7090,
- 0x6448, 0x1024, 0x286c, 0x4c00, 0x7c90,
- 0x7048, 0x1c24, 0x346c, 0x5800, 0x0490,
- 0x7c4a, 0x2826, 0x406e, 0x6402, 0x1092,
- 0x044a, 0x3426, 0x4c6e, 0x7002, 0x1c92,
- 0x104a, 0x4026, 0x586e, 0x7c02, 0x2892,
- 0x1c4a, 0x4c26, 0x646e, 0x0402, 0x3492,
- 0x284a, 0x5826, 0x706e, 0x1002, 0x4092,
- 0x344a, 0x6426, 0x7c6e, 0x1c02, 0x4c92,
- 0x404a, 0x7026, 0x046e, 0x2802, 0x5892,
- 0x4c4a, 0x7c26, 0x106e, 0x3402, 0x6492,
- 0x584a, 0x0426, 0x1c6e, 0x4002, 0x7092,
- 0x644a, 0x1026, 0x286e, 0x4c02, 0x7c92,
- 0x704a, 0x1c26, 0x346e, 0x5802, 0x0492,
- 0x7c4c, 0x2828, 0x4070, 0x6404, 0x1094,
- 0x044c, 0x3428, 0x4c70, 0x7004, 0x1c94,
- 0x104c, 0x4028, 0x5870, 0x7c04, 0x2894,
- 0x1c4c, 0x4c28, 0x6470, 0x0404, 0x3494,
- 0x284c, 0x5828, 0x7070, 0x1004, 0x4094,
- 0x285a, 0x5836, 0x707e, 0x1012, 0x40a2,
- 0x345a, 0x6436, 0x7c7e, 0x1c12, 0x4ca2,
- 0x405a, 0x7036, 0x047e, 0x2812, 0x58a2,
- 0x4c5a, 0x7c36, 0x107e, 0x3412, 0x64a2,
- 0x585a, 0x0436, 0x1c7e, 0x4012, 0x70a2,
- 0x645a, 0x1036, 0x287e, 0x4c12, 0x7ca2,
- 0x705a, 0x1c36, 0x347e, 0x5812, 0x04a2,
- 0x7c5a, 0x2836, 0x407e, 0x6412, 0x10a2,
- 0x045a, 0x3436, 0x4c7e, 0x7012, 0x1ca2,
- 0x105a, 0x4036, 0x587e, 0x7c12, 0x28a2,
- 0x1c5a, 0x4c36, 0x647e, 0x0412, 0x34a2,
- 0x285c, 0x5838, 0x7080, 0x1014, 0x40a4,
- 0x345c, 0x6438, 0x7c80, 0x1c14, 0x4ca4,
- 0x405c, 0x7038, 0x0480, 0x2814, 0x58a4,
- 0x4c5c, 0x7c38, 0x1080, 0x3414, 0x64a4,
- 0x585c, 0x0438, 0x1c80, 0x4014, 0x70a4,
- 0x645c, 0x1038, 0x2880, 0x4c14, 0x7ca4,
- 0x705c, 0x1c38, 0x3480, 0x5814, 0x04a4,
- 0x7c5c, 0x2838, 0x4080, 0x6414, 0x10a4,
- 0x045c, 0x3438, 0x4c80, 0x7014, 0x1ca4,
- 0x105c, 0x4038, 0x5880, 0x7c14, 0x28a4,
- 0x1c5c, 0x4c38, 0x6480, 0x0414, 0x34a4,
- 0x285e, 0x583a, 0x7082, 0x1016, 0x40a6,
- 0x345e, 0x643a, 0x7c82, 0x1c16, 0x4ca6,
- 0x405e, 0x703a, 0x0482, 0x2816, 0x58a6,
- 0x4c5e, 0x7c3a, 0x1082, 0x3416, 0x64a6,
- 0x585e, 0x043a, 0x1c82, 0x4016, 0x70a6,
- 0x645e, 0x103a, 0x2882, 0x4c16, 0x7ca6,
- 0x705e, 0x1c3a, 0x3482, 0x5816, 0x04a6,
- 0x7c5e, 0x283a, 0x4082, 0x6416, 0x10a6,
- 0x045e, 0x343a, 0x4c82, 0x7016, 0x1ca6,
- 0x105e, 0x403a, 0x5882, 0x7c16, 0x28a6,
- 0x1c5e, 0x4c3a, 0x6482, 0x0416, 0x34a6,
- 0x2860, 0x583c, 0x7084, 0x1018, 0x40a8,
- 0x3460, 0x643c, 0x7c84, 0x1c18, 0x4ca8,
- 0x4060, 0x703c, 0x0484, 0x2818, 0x58a8,
- 0x4c60, 0x7c3c, 0x1084, 0x3418, 0x64a8,
- 0x5860, 0x043c, 0x1c84, 0x4018, 0x70a8,
- 0x6460, 0x103c, 0x2884, 0x4c18, 0x7ca8,
- 0x7060, 0x1c3c, 0x3484, 0x5818, 0x04a8,
- 0x7c60, 0x283c, 0x4084, 0x6418, 0x10a8,
- 0x0460, 0x343c, 0x4c84, 0x7018, 0x1ca8,
- 0x1060, 0x403c, 0x5884, 0x7c18, 0x28a8,
- 0x1c60, 0x4c3c, 0x6484, 0x0418, 0x34a8,
- 0x2862, 0x583e, 0x7086, 0x101a, 0x40aa,
- 0x3462, 0x643e, 0x7c86, 0x1c1a, 0x4caa,
- 0x4062, 0x703e, 0x0486, 0x281a, 0x58aa,
- 0x4c62, 0x7c3e, 0x1086, 0x341a, 0x64aa,
- 0x5862, 0x043e, 0x1c86, 0x401a, 0x70aa,
- 0x6462, 0x103e, 0x2886, 0x4c1a, 0x7caa,
- 0x7062, 0x1c3e, 0x3486, 0x581a, 0x04aa,
- 0x7c62, 0x283e, 0x4086, 0x641a, 0x10aa,
- 0x0462, 0x343e, 0x4c86, 0x701a, 0x1caa,
- 0x1062, 0x403e, 0x5886, 0x7c1a, 0x28aa,
- 0x1c62, 0x4c3e, 0x6486, 0x041a, 0x34aa,
- 0x2864, 0x5840, 0x7088, 0x101c, 0x40ac,
- 0x3464, 0x6440, 0x7c88, 0x1c1c, 0x4cac,
- 0x4064, 0x7040, 0x0488, 0x281c, 0x58ac,
- 0x4c64, 0x7c40, 0x1088, 0x341c, 0x64ac,
- 0x5864, 0x0440, 0x1c88, 0x401c, 0x70ac,
- 0x6464, 0x1040, 0x2888, 0x4c1c, 0x7cac,
- 0x7064, 0x1c40, 0x3488, 0x581c, 0x04ac,
- 0x7c64, 0x2840, 0x4088, 0x641c, 0x10ac,
- 0x0464, 0x3440, 0x4c88, 0x701c, 0x1cac,
- 0x1064, 0x4040, 0x5888, 0x7c1c, 0x28ac,
- 0x1c64, 0x4c40, 0x6488, 0x041c, 0x34ac,
- 0x2866, 0x5842, 0x708a, 0x101e, 0x40ae,
- 0x3466, 0x6442, 0x7c8a, 0x1c1e, 0x4cae,
- 0x4066, 0x7042, 0x048a, 0x281e, 0x58ae,
- 0x4c66, 0x7c42, 0x108a, 0x341e, 0x64ae,
- 0x5866, 0x0442, 0x1c8a, 0x401e, 0x70ae,
- 0x6466, 0x1042, 0x288a, 0x4c1e, 0x7cae,
- 0x7066, 0x1c42, 0x348a, 0x581e, 0x04ae,
- 0x7c66, 0x2842, 0x408a, 0x641e, 0x10ae,
- 0x0466, 0x3442, 0x4c8a, 0x701e, 0x1cae,
- 0x1066, 0x4042, 0x588a, 0x7c1e, 0x28ae,
- 0x1c66, 0x4c42, 0x648a, 0x041e, 0x34ae,
- 0x2868, 0x5844, 0x708c, 0x1020, 0x40b0,
- 0x3468, 0x6444, 0x7c8c, 0x1c20, 0x4cb0,
- 0x4068, 0x7044, 0x048c, 0x2820, 0x58b0,
- 0x4c68, 0x7c44, 0x108c, 0x3420, 0x64b0,
- 0x5868, 0x0444, 0x1c8c, 0x4020, 0x70b0,
- 0x6468, 0x1044, 0x288c, 0x4c20, 0x7cb0,
- 0x7068, 0x1c44, 0x348c, 0x5820, 0x04b0,
- 0x7c68, 0x2844, 0x408c, 0x6420, 0x10b0,
- 0x0468, 0x3444, 0x4c8c, 0x7020, 0x1cb0,
- 0x1068, 0x4044, 0x588c, 0x7c20, 0x28b0,
- 0x1c68, 0x4c44, 0x648c, 0x0420, 0x34b0,
- 0x286a, 0x5846, 0x708e, 0x1022, 0x40b2,
- 0x346a, 0x6446, 0x7c8e, 0x1c22, 0x4cb2,
- 0x406a, 0x7046, 0x048e, 0x2822, 0x58b2,
- 0x4c6a, 0x7c46, 0x108e, 0x3422, 0x64b2,
- 0x586a, 0x0446, 0x1c8e, 0x4022, 0x70b2,
- 0x646a, 0x1046, 0x288e, 0x4c22, 0x7cb2,
- 0x706a, 0x1c46, 0x348e, 0x5822, 0x04b2,
- 0x7c6a, 0x2846, 0x408e, 0x6422, 0x10b2,
- 0x046a, 0x3446, 0x4c8e, 0x7022, 0x1cb2,
- 0x106a, 0x4046, 0x588e, 0x7c22, 0x28b2,
- 0x1c6a, 0x4c46, 0x648e, 0x0422, 0x34b2,
- 0x2c5a, 0x5c36, 0x747e, 0x1412, 0x44a2,
- 0x385a, 0x6836, 0x807e, 0x2012, 0x50a2,
- 0x445a, 0x7436, 0x087e, 0x2c12, 0x5ca2,
- 0x505a, 0x8036, 0x147e, 0x3812, 0x68a2,
- 0x5c5a, 0x0836, 0x207e, 0x4412, 0x74a2,
- 0x685a, 0x1436, 0x2c7e, 0x5012, 0x80a2,
- 0x745a, 0x2036, 0x387e, 0x5c12, 0x08a2,
- 0x805a, 0x2c36, 0x447e, 0x6812, 0x14a2,
- 0x085a, 0x3836, 0x507e, 0x7412, 0x20a2,
- 0x145a, 0x4436, 0x5c7e, 0x8012, 0x2ca2,
- 0x205a, 0x5036, 0x687e, 0x0812, 0x38a2,
- 0x2c5c, 0x5c38, 0x7480, 0x1414, 0x44a4,
- 0x385c, 0x6838, 0x8080, 0x2014, 0x50a4,
- 0x445c, 0x7438, 0x0880, 0x2c14, 0x5ca4,
- 0x505c, 0x8038, 0x1480, 0x3814, 0x68a4,
- 0x5c5c, 0x0838, 0x2080, 0x4414, 0x74a4,
- 0x685c, 0x1438, 0x2c80, 0x5014, 0x80a4,
- 0x745c, 0x2038, 0x3880, 0x5c14, 0x08a4,
- 0x805c, 0x2c38, 0x4480, 0x6814, 0x14a4,
- 0x085c, 0x3838, 0x5080, 0x7414, 0x20a4,
- 0x145c, 0x4438, 0x5c80, 0x8014, 0x2ca4,
- 0x205c, 0x5038, 0x6880, 0x0814, 0x38a4,
- 0x2c5e, 0x5c3a, 0x7482, 0x1416, 0x44a6,
- 0x385e, 0x683a, 0x8082, 0x2016, 0x50a6,
- 0x445e, 0x743a, 0x0882, 0x2c16, 0x5ca6,
- 0x505e, 0x803a, 0x1482, 0x3816, 0x68a6,
- 0x5c5e, 0x083a, 0x2082, 0x4416, 0x74a6,
- 0x685e, 0x143a, 0x2c82, 0x5016, 0x80a6,
- 0x745e, 0x203a, 0x3882, 0x5c16, 0x08a6,
- 0x805e, 0x2c3a, 0x4482, 0x6816, 0x14a6,
- 0x085e, 0x383a, 0x5082, 0x7416, 0x20a6,
- 0x145e, 0x443a, 0x5c82, 0x8016, 0x2ca6,
- 0x205e, 0x503a, 0x6882, 0x0816, 0x38a6,
- 0x2c60, 0x5c3c, 0x7484, 0x1418, 0x44a8,
- 0x3860, 0x683c, 0x8084, 0x2018, 0x50a8,
- 0x4460, 0x743c, 0x0884, 0x2c18, 0x5ca8,
- 0x5060, 0x803c, 0x1484, 0x3818, 0x68a8,
- 0x5c60, 0x083c, 0x2084, 0x4418, 0x74a8,
- 0x6860, 0x143c, 0x2c84, 0x5018, 0x80a8,
- 0x7460, 0x203c, 0x3884, 0x5c18, 0x08a8,
- 0x8060, 0x2c3c, 0x4484, 0x6818, 0x14a8,
- 0x0860, 0x383c, 0x5084, 0x7418, 0x20a8,
- 0x1460, 0x443c, 0x5c84, 0x8018, 0x2ca8,
- 0x2060, 0x503c, 0x6884, 0x0818, 0x38a8,
- 0x2c62, 0x5c3e, 0x7486, 0x141a, 0x44aa,
- 0x3862, 0x683e, 0x8086, 0x201a, 0x50aa,
- 0x4462, 0x743e, 0x0886, 0x2c1a, 0x5caa,
- 0x5062, 0x803e, 0x1486, 0x381a, 0x68aa,
- 0x5c62, 0x083e, 0x2086, 0x441a, 0x74aa,
- 0x6862, 0x143e, 0x2c86, 0x501a, 0x80aa,
- 0x7462, 0x203e, 0x3886, 0x5c1a, 0x08aa,
- 0x8062, 0x2c3e, 0x4486, 0x681a, 0x14aa,
- 0x0862, 0x383e, 0x5086, 0x741a, 0x20aa,
- 0x1462, 0x443e, 0x5c86, 0x801a, 0x2caa,
- 0x2062, 0x503e, 0x6886, 0x081a, 0x38aa,
- 0x2c64, 0x5c40, 0x7488, 0x141c, 0x44ac,
- 0x3864, 0x6840, 0x8088, 0x201c, 0x50ac,
- 0x4464, 0x7440, 0x0888, 0x2c1c, 0x5cac,
- 0x5064, 0x8040, 0x1488, 0x381c, 0x68ac,
- 0x5c64, 0x0840, 0x2088, 0x441c, 0x74ac,
- 0x6864, 0x1440, 0x2c88, 0x501c, 0x80ac,
- 0x7464, 0x2040, 0x3888, 0x5c1c, 0x08ac,
- 0x8064, 0x2c40, 0x4488, 0x681c, 0x14ac,
- 0x0864, 0x3840, 0x5088, 0x741c, 0x20ac,
- 0x1464, 0x4440, 0x5c88, 0x801c, 0x2cac,
- 0x2064, 0x5040, 0x6888, 0x081c, 0x38ac,
- 0x2c66, 0x5c42, 0x748a, 0x141e, 0x44ae,
- 0x3866, 0x6842, 0x808a, 0x201e, 0x50ae,
- 0x4466, 0x7442, 0x088a, 0x2c1e, 0x5cae,
- 0x5066, 0x8042, 0x148a, 0x381e, 0x68ae,
- 0x5c66, 0x0842, 0x208a, 0x441e, 0x74ae,
- 0x6866, 0x1442, 0x2c8a, 0x501e, 0x80ae,
- 0x7466, 0x2042, 0x388a, 0x5c1e, 0x08ae,
- 0x8066, 0x2c42, 0x448a, 0x681e, 0x14ae,
- 0x0866, 0x3842, 0x508a, 0x741e, 0x20ae,
- 0x1466, 0x4442, 0x5c8a, 0x801e, 0x2cae,
- 0x2066, 0x5042, 0x688a, 0x081e, 0x38ae,
- 0x2c68, 0x5c44, 0x748c, 0x1420, 0x44b0,
- 0x3868, 0x6844, 0x808c, 0x2020, 0x50b0,
- 0x4468, 0x7444, 0x088c, 0x2c20, 0x5cb0,
- 0x5068, 0x8044, 0x148c, 0x3820, 0x68b0,
- 0x5c68, 0x0844, 0x208c, 0x4420, 0x74b0,
- 0x6868, 0x1444, 0x2c8c, 0x5020, 0x80b0,
- 0x7468, 0x2044, 0x388c, 0x5c20, 0x08b0,
- 0x8068, 0x2c44, 0x448c, 0x6820, 0x14b0,
- 0x0868, 0x3844, 0x508c, 0x7420, 0x20b0,
- 0x1468, 0x4444, 0x5c8c, 0x8020, 0x2cb0,
- 0x2068, 0x5044, 0x688c, 0x0820, 0x38b0,
- 0x2c6a, 0x5c46, 0x748e, 0x1422, 0x44b2,
- 0x386a, 0x6846, 0x808e, 0x2022, 0x50b2,
- 0x446a, 0x7446, 0x088e, 0x2c22, 0x5cb2,
- 0x506a, 0x8046, 0x148e, 0x3822, 0x68b2,
- 0x5c6a, 0x0846, 0x208e, 0x4422, 0x74b2,
- 0x686a, 0x1446, 0x2c8e, 0x5022, 0x80b2,
- 0x746a, 0x2046, 0x388e, 0x5c22, 0x08b2,
- 0x806a, 0x2c46, 0x448e, 0x6822, 0x14b2,
- 0x086a, 0x3846, 0x508e, 0x7422, 0x20b2,
- 0x146a, 0x4446, 0x5c8e, 0x8022, 0x2cb2,
- 0x206a, 0x5046, 0x688e, 0x0822, 0x38b2,
- 0x305a, 0x6036, 0x787e, 0x1812, 0x48a2,
- 0x3c5a, 0x6c36, 0x847e, 0x2412, 0x54a2,
- 0x485a, 0x7836, 0x0c7e, 0x3012, 0x60a2,
- 0x545a, 0x8436, 0x187e, 0x3c12, 0x6ca2,
- 0x605a, 0x0c36, 0x247e, 0x4812, 0x78a2,
- 0x6c5a, 0x1836, 0x307e, 0x5412, 0x84a2,
- 0x785a, 0x2436, 0x3c7e, 0x6012, 0x0ca2,
- 0x845a, 0x3036, 0x487e, 0x6c12, 0x18a2,
- 0x0c5a, 0x3c36, 0x547e, 0x7812, 0x24a2,
- 0x185a, 0x4836, 0x607e, 0x8412, 0x30a2,
- 0x245a, 0x5436, 0x6c7e, 0x0c12, 0x3ca2,
- 0x305c, 0x6038, 0x7880, 0x1814, 0x48a4,
- 0x3c5c, 0x6c38, 0x8480, 0x2414, 0x54a4,
- 0x485c, 0x7838, 0x0c80, 0x3014, 0x60a4,
- 0x545c, 0x8438, 0x1880, 0x3c14, 0x6ca4,
- 0x605c, 0x0c38, 0x2480, 0x4814, 0x78a4,
- 0x6c5c, 0x1838, 0x3080, 0x5414, 0x84a4,
- 0x785c, 0x2438, 0x3c80, 0x6014, 0x0ca4,
- 0x845c, 0x3038, 0x4880, 0x6c14, 0x18a4,
- 0x0c5c, 0x3c38, 0x5480, 0x7814, 0x24a4,
- 0x185c, 0x4838, 0x6080, 0x8414, 0x30a4,
- 0x245c, 0x5438, 0x6c80, 0x0c14, 0x3ca4,
- 0x305e, 0x603a, 0x7882, 0x1816, 0x48a6,
- 0x3c5e, 0x6c3a, 0x8482, 0x2416, 0x54a6,
- 0x485e, 0x783a, 0x0c82, 0x3016, 0x60a6,
- 0x545e, 0x843a, 0x1882, 0x3c16, 0x6ca6,
- 0x605e, 0x0c3a, 0x2482, 0x4816, 0x78a6,
- 0x6c5e, 0x183a, 0x3082, 0x5416, 0x84a6,
- 0x785e, 0x243a, 0x3c82, 0x6016, 0x0ca6,
- 0x845e, 0x303a, 0x4882, 0x6c16, 0x18a6,
- 0x0c5e, 0x3c3a, 0x5482, 0x7816, 0x24a6,
- 0x185e, 0x483a, 0x6082, 0x8416, 0x30a6,
- 0x245e, 0x543a, 0x6c82, 0x0c16, 0x3ca6,
- 0x3060, 0x603c, 0x7884, 0x1818, 0x48a8,
- 0x3c60, 0x6c3c, 0x8484, 0x2418, 0x54a8,
- 0x4860, 0x783c, 0x0c84, 0x3018, 0x60a8,
- 0x5460, 0x843c, 0x1884, 0x3c18, 0x6ca8,
- 0x6060, 0x0c3c, 0x2484, 0x4818, 0x78a8,
- 0x6c60, 0x183c, 0x3084, 0x5418, 0x84a8,
- 0x7860, 0x243c, 0x3c84, 0x6018, 0x0ca8,
- 0x8460, 0x303c, 0x4884, 0x6c18, 0x18a8,
- 0x0c60, 0x3c3c, 0x5484, 0x7818, 0x24a8,
- 0x1860, 0x483c, 0x6084, 0x8418, 0x30a8,
- 0x2460, 0x543c, 0x6c84, 0x0c18, 0x3ca8,
- 0x3062, 0x603e, 0x7886, 0x181a, 0x48aa,
- 0x3c62, 0x6c3e, 0x8486, 0x241a, 0x54aa,
- 0x4862, 0x783e, 0x0c86, 0x301a, 0x60aa,
- 0x5462, 0x843e, 0x1886, 0x3c1a, 0x6caa,
- 0x6062, 0x0c3e, 0x2486, 0x481a, 0x78aa,
- 0x6c62, 0x183e, 0x3086, 0x541a, 0x84aa,
- 0x7862, 0x243e, 0x3c86, 0x601a, 0x0caa,
- 0x8462, 0x303e, 0x4886, 0x6c1a, 0x18aa,
- 0x0c62, 0x3c3e, 0x5486, 0x781a, 0x24aa,
- 0x1862, 0x483e, 0x6086, 0x841a, 0x30aa,
- 0x2462, 0x543e, 0x6c86, 0x0c1a, 0x3caa,
- 0x3064, 0x6040, 0x7888, 0x181c, 0x48ac,
- 0x3c64, 0x6c40, 0x8488, 0x241c, 0x54ac,
- 0x4864, 0x7840, 0x0c88, 0x301c, 0x60ac,
- 0x5464, 0x8440, 0x1888, 0x3c1c, 0x6cac,
- 0x6064, 0x0c40, 0x2488, 0x481c, 0x78ac,
- 0x6c64, 0x1840, 0x3088, 0x541c, 0x84ac,
- 0x7864, 0x2440, 0x3c88, 0x601c, 0x0cac,
- 0x8464, 0x3040, 0x4888, 0x6c1c, 0x18ac,
- 0x0c64, 0x3c40, 0x5488, 0x781c, 0x24ac,
- 0x1864, 0x4840, 0x6088, 0x841c, 0x30ac,
- 0x2464, 0x5440, 0x6c88, 0x0c1c, 0x3cac,
- 0x3066, 0x6042, 0x788a, 0x181e, 0x48ae,
- 0x3c66, 0x6c42, 0x848a, 0x241e, 0x54ae,
- 0x4866, 0x7842, 0x0c8a, 0x301e, 0x60ae,
- 0x5466, 0x8442, 0x188a, 0x3c1e, 0x6cae,
- 0x6066, 0x0c42, 0x248a, 0x481e, 0x78ae,
- 0x6c66, 0x1842, 0x308a, 0x541e, 0x84ae,
- 0x7866, 0x2442, 0x3c8a, 0x601e, 0x0cae,
- 0x8466, 0x3042, 0x488a, 0x6c1e, 0x18ae,
- 0x0c66, 0x3c42, 0x548a, 0x781e, 0x24ae,
- 0x1866, 0x4842, 0x608a, 0x841e, 0x30ae,
- 0x2466, 0x5442, 0x6c8a, 0x0c1e, 0x3cae,
- 0x3068, 0x6044, 0x788c, 0x1820, 0x48b0,
- 0x3c68, 0x6c44, 0x848c, 0x2420, 0x54b0,
- 0x4868, 0x7844, 0x0c8c, 0x3020, 0x60b0,
- 0x5468, 0x8444, 0x188c, 0x3c20, 0x6cb0,
- 0x6068, 0x0c44, 0x248c, 0x4820, 0x78b0,
- 0x6c68, 0x1844, 0x308c, 0x5420, 0x84b0,
- 0x7868, 0x2444, 0x3c8c, 0x6020, 0x0cb0,
- 0x8468, 0x3044, 0x488c, 0x6c20, 0x18b0,
- 0x0c68, 0x3c44, 0x548c, 0x7820, 0x24b0,
- 0x1868, 0x4844, 0x608c, 0x8420, 0x30b0,
- 0x2468, 0x5444, 0x6c8c, 0x0c20, 0x3cb0,
- 0x306a, 0x6046, 0x788e, 0x1822, 0x48b2,
- 0x3c6a, 0x6c46, 0x848e, 0x2422, 0x54b2,
- 0x486a, 0x7846, 0x0c8e, 0x3022, 0x60b2,
- 0x546a, 0x8446, 0x188e, 0x3c22, 0x6cb2,
- 0x606a, 0x0c46, 0x248e, 0x4822, 0x78b2,
- 0x6c6a, 0x1846, 0x308e, 0x5422, 0x84b2,
- 0x786a, 0x2446, 0x3c8e, 0x6022, 0x0cb2,
- 0x846a, 0x3046, 0x488e, 0x6c22, 0x18b2,
- 0x0c6a, 0x3c46, 0x548e, 0x7822, 0x24b2,
- 0x186a, 0x4846, 0x608e, 0x8422, 0x30b2,
- 0x246a, 0x5446, 0x6c8e, 0x0c22, 0x3cb2,
- 0x285a, 0x5836, 0x707e, 0x1012, 0x40a2,
- 0x345a, 0x6436, 0x7c7e, 0x1c12, 0x4ca2,
- 0x405a, 0x7036, 0x047e, 0x2812, 0x58a2,
- 0x4c5a, 0x7c36, 0x107e, 0x3412, 0x64a2,
- 0x585a, 0x0436, 0x1c7e, 0x4012, 0x70a2,
- 0x645a, 0x1036, 0x287e, 0x4c12, 0x7ca2,
- 0x705a, 0x1c36, 0x347e, 0x5812, 0x04a2,
- 0x7c5a, 0x2836, 0x407e, 0x6412, 0x10a2,
- 0x045a, 0x3436, 0x4c7e, 0x7012, 0x1ca2,
- 0x105a, 0x4036, 0x587e, 0x7c12, 0x28a2,
- 0x1c5a, 0x4c36, 0x647e, 0x0412, 0x34a2,
- 0x285c, 0x5838, 0x7080, 0x1014, 0x40a4,
- 0x345c, 0x6438, 0x7c80, 0x1c14, 0x4ca4,
- 0x405c, 0x7038, 0x0480, 0x2814, 0x58a4,
- 0x4c5c, 0x7c38, 0x1080, 0x3414, 0x64a4,
- 0x585c, 0x0438, 0x1c80, 0x4014, 0x70a4,
- 0x645c, 0x1038, 0x2880, 0x4c14, 0x7ca4,
- 0x705c, 0x1c38, 0x3480, 0x5814, 0x04a4,
- 0x7c5c, 0x2838, 0x4080, 0x6414, 0x10a4,
- 0x045c, 0x3438, 0x4c80, 0x7014, 0x1ca4,
- 0x105c, 0x4038, 0x5880, 0x7c14, 0x28a4,
- 0x1c5c, 0x4c38, 0x6480, 0x0414, 0x34a4,
- 0x285e, 0x583a, 0x7082, 0x1016, 0x40a6,
- 0x345e, 0x643a, 0x7c82, 0x1c16, 0x4ca6,
- 0x405e, 0x703a, 0x0482, 0x2816, 0x58a6,
- 0x4c5e, 0x7c3a, 0x1082, 0x3416, 0x64a6,
- 0x585e, 0x043a, 0x1c82, 0x4016, 0x70a6,
-};
-
-static const uint16_t dv_place_720p60[2*10*27*5] = {
- 0x1230, 0x3618, 0x4848, 0x0000, 0x2460,
- 0x2430, 0x4818, 0x0048, 0x1200, 0x3660,
- 0x3630, 0x0018, 0x1248, 0x2400, 0x4860,
- 0x4830, 0x1218, 0x2448, 0x3600, 0x0060,
- 0x0030, 0x2418, 0x3648, 0x4800, 0x1260,
- 0x1232, 0x361a, 0x484a, 0x0002, 0x2462,
- 0x2432, 0x481a, 0x004a, 0x1202, 0x3662,
- 0x3632, 0x001a, 0x124a, 0x2402, 0x4862,
- 0x4832, 0x121a, 0x244a, 0x3602, 0x0062,
- 0x0032, 0x241a, 0x364a, 0x4802, 0x1262,
- 0x1234, 0x361c, 0x484c, 0x0004, 0x2464,
- 0x2434, 0x481c, 0x004c, 0x1204, 0x3664,
- 0x3634, 0x001c, 0x124c, 0x2404, 0x4864,
- 0x4834, 0x121c, 0x244c, 0x3604, 0x0064,
- 0x0034, 0x241c, 0x364c, 0x4804, 0x1264,
- 0x1236, 0x361e, 0x484e, 0x0006, 0x2466,
- 0x2436, 0x481e, 0x004e, 0x1206, 0x3666,
- 0x3636, 0x001e, 0x124e, 0x2406, 0x4866,
- 0x4836, 0x121e, 0x244e, 0x3606, 0x0066,
- 0x0036, 0x241e, 0x364e, 0x4806, 0x1266,
- 0x1238, 0x3620, 0x4850, 0x0008, 0x2468,
- 0x2438, 0x4820, 0x0050, 0x1208, 0x3668,
- 0x3638, 0x0020, 0x1250, 0x2408, 0x4868,
- 0x4838, 0x1220, 0x2450, 0x3608, 0x0068,
- 0x0038, 0x2420, 0x3650, 0x4808, 0x1268,
- 0x123a, 0x3622, 0x4852, 0x000a, 0x246a,
- 0x243a, 0x4822, 0x0052, 0x120a, 0x366a,
- 0x363a, 0x0022, 0x1252, 0x240a, 0x486a,
- 0x483a, 0x1222, 0x2452, 0x360a, 0x006a,
- 0x003a, 0x2422, 0x3652, 0x480a, 0x126a,
- 0x1430, 0x3818, 0x4a48, 0x0200, 0x2660,
- 0x2630, 0x4a18, 0x0248, 0x1400, 0x3860,
- 0x3830, 0x0218, 0x1448, 0x2600, 0x4a60,
- 0x4a30, 0x1418, 0x2648, 0x3800, 0x0260,
- 0x0230, 0x2618, 0x3848, 0x4a00, 0x1460,
- 0x1432, 0x381a, 0x4a4a, 0x0202, 0x2662,
- 0x2632, 0x4a1a, 0x024a, 0x1402, 0x3862,
- 0x3832, 0x021a, 0x144a, 0x2602, 0x4a62,
- 0x4a32, 0x141a, 0x264a, 0x3802, 0x0262,
- 0x0232, 0x261a, 0x384a, 0x4a02, 0x1462,
- 0x1434, 0x381c, 0x4a4c, 0x0204, 0x2664,
- 0x2634, 0x4a1c, 0x024c, 0x1404, 0x3864,
- 0x3834, 0x021c, 0x144c, 0x2604, 0x4a64,
- 0x4a34, 0x141c, 0x264c, 0x3804, 0x0264,
- 0x0234, 0x261c, 0x384c, 0x4a04, 0x1464,
- 0x1436, 0x381e, 0x4a4e, 0x0206, 0x2666,
- 0x2636, 0x4a1e, 0x024e, 0x1406, 0x3866,
- 0x3836, 0x021e, 0x144e, 0x2606, 0x4a66,
- 0x4a36, 0x141e, 0x264e, 0x3806, 0x0266,
- 0x0236, 0x261e, 0x384e, 0x4a06, 0x1466,
- 0x1438, 0x3820, 0x4a50, 0x0208, 0x2668,
- 0x2638, 0x4a20, 0x0250, 0x1408, 0x3868,
- 0x3838, 0x0220, 0x1450, 0x2608, 0x4a68,
- 0x4a38, 0x1420, 0x2650, 0x3808, 0x0268,
- 0x0238, 0x2620, 0x3850, 0x4a08, 0x1468,
- 0x143a, 0x3822, 0x4a52, 0x020a, 0x266a,
- 0x263a, 0x4a22, 0x0252, 0x140a, 0x386a,
- 0x383a, 0x0222, 0x1452, 0x260a, 0x4a6a,
- 0x4a3a, 0x1422, 0x2652, 0x380a, 0x026a,
- 0x023a, 0x2622, 0x3852, 0x4a0a, 0x146a,
- 0x1630, 0x3a18, 0x4c48, 0x0400, 0x2860,
- 0x2830, 0x4c18, 0x0448, 0x1600, 0x3a60,
- 0x3a30, 0x0418, 0x1648, 0x2800, 0x4c60,
- 0x4c30, 0x1618, 0x2848, 0x3a00, 0x0460,
- 0x0430, 0x2818, 0x3a48, 0x4c00, 0x1660,
- 0x1632, 0x3a1a, 0x4c4a, 0x0402, 0x2862,
- 0x2832, 0x4c1a, 0x044a, 0x1602, 0x3a62,
- 0x3a32, 0x041a, 0x164a, 0x2802, 0x4c62,
- 0x4c32, 0x161a, 0x284a, 0x3a02, 0x0462,
- 0x0432, 0x281a, 0x3a4a, 0x4c02, 0x1662,
- 0x1634, 0x3a1c, 0x4c4c, 0x0404, 0x2864,
- 0x2834, 0x4c1c, 0x044c, 0x1604, 0x3a64,
- 0x3a34, 0x041c, 0x164c, 0x2804, 0x4c64,
- 0x4c34, 0x161c, 0x284c, 0x3a04, 0x0464,
- 0x0434, 0x281c, 0x3a4c, 0x4c04, 0x1664,
- 0x1636, 0x3a1e, 0x4c4e, 0x0406, 0x2866,
- 0x2836, 0x4c1e, 0x044e, 0x1606, 0x3a66,
- 0x3a36, 0x041e, 0x164e, 0x2806, 0x4c66,
- 0x4c36, 0x161e, 0x284e, 0x3a06, 0x0466,
- 0x0436, 0x281e, 0x3a4e, 0x4c06, 0x1666,
- 0x1638, 0x3a20, 0x4c50, 0x0408, 0x2868,
- 0x2838, 0x4c20, 0x0450, 0x1608, 0x3a68,
- 0x3a38, 0x0420, 0x1650, 0x2808, 0x4c68,
- 0x4c38, 0x1620, 0x2850, 0x3a08, 0x0468,
- 0x0438, 0x2820, 0x3a50, 0x4c08, 0x1668,
- 0x163a, 0x3a22, 0x4c52, 0x040a, 0x286a,
- 0x283a, 0x4c22, 0x0452, 0x160a, 0x3a6a,
- 0x3a3a, 0x0422, 0x1652, 0x280a, 0x4c6a,
- 0x4c3a, 0x1622, 0x2852, 0x3a0a, 0x046a,
- 0x043a, 0x2822, 0x3a52, 0x4c0a, 0x166a,
- 0x1830, 0x3c18, 0x4e48, 0x0600, 0x2a60,
- 0x2a30, 0x4e18, 0x0648, 0x1800, 0x3c60,
- 0x3c30, 0x0618, 0x1848, 0x2a00, 0x4e60,
- 0x4e30, 0x1818, 0x2a48, 0x3c00, 0x0660,
- 0x0630, 0x2a18, 0x3c48, 0x4e00, 0x1860,
- 0x1832, 0x3c1a, 0x4e4a, 0x0602, 0x2a62,
- 0x2a32, 0x4e1a, 0x064a, 0x1802, 0x3c62,
- 0x3c32, 0x061a, 0x184a, 0x2a02, 0x4e62,
- 0x4e32, 0x181a, 0x2a4a, 0x3c02, 0x0662,
- 0x0632, 0x2a1a, 0x3c4a, 0x4e02, 0x1862,
- 0x1834, 0x3c1c, 0x4e4c, 0x0604, 0x2a64,
- 0x2a34, 0x4e1c, 0x064c, 0x1804, 0x3c64,
- 0x3c34, 0x061c, 0x184c, 0x2a04, 0x4e64,
- 0x4e34, 0x181c, 0x2a4c, 0x3c04, 0x0664,
- 0x0634, 0x2a1c, 0x3c4c, 0x4e04, 0x1864,
- 0x1836, 0x3c1e, 0x4e4e, 0x0606, 0x2a66,
- 0x2a36, 0x4e1e, 0x064e, 0x1806, 0x3c66,
- 0x3c36, 0x061e, 0x184e, 0x2a06, 0x4e66,
- 0x4e36, 0x181e, 0x2a4e, 0x3c06, 0x0666,
- 0x0636, 0x2a1e, 0x3c4e, 0x4e06, 0x1866,
- 0x1838, 0x3c20, 0x4e50, 0x0608, 0x2a68,
- 0x2a38, 0x4e20, 0x0650, 0x1808, 0x3c68,
- 0x3c38, 0x0620, 0x1850, 0x2a08, 0x4e68,
- 0x4e38, 0x1820, 0x2a50, 0x3c08, 0x0668,
- 0x0638, 0x2a20, 0x3c50, 0x4e08, 0x1868,
- 0x183a, 0x3c22, 0x4e52, 0x060a, 0x2a6a,
- 0x2a3a, 0x4e22, 0x0652, 0x180a, 0x3c6a,
- 0x3c3a, 0x0622, 0x1852, 0x2a0a, 0x4e6a,
- 0x4e3a, 0x1822, 0x2a52, 0x3c0a, 0x066a,
- 0x063a, 0x2a22, 0x3c52, 0x4e0a, 0x186a,
- 0x1a30, 0x3e18, 0x5048, 0x0800, 0x2c60,
- 0x2c30, 0x5018, 0x0848, 0x1a00, 0x3e60,
- 0x3e30, 0x0818, 0x1a48, 0x2c00, 0x5060,
- 0x5030, 0x1a18, 0x2c48, 0x3e00, 0x0860,
- 0x0830, 0x2c18, 0x3e48, 0x5000, 0x1a60,
- 0x1a32, 0x3e1a, 0x504a, 0x0802, 0x2c62,
- 0x2c32, 0x501a, 0x084a, 0x1a02, 0x3e62,
- 0x3e32, 0x081a, 0x1a4a, 0x2c02, 0x5062,
- 0x5032, 0x1a1a, 0x2c4a, 0x3e02, 0x0862,
- 0x0832, 0x2c1a, 0x3e4a, 0x5002, 0x1a62,
- 0x1a34, 0x3e1c, 0x504c, 0x0804, 0x2c64,
- 0x2c34, 0x501c, 0x084c, 0x1a04, 0x3e64,
- 0x3e34, 0x081c, 0x1a4c, 0x2c04, 0x5064,
- 0x5034, 0x1a1c, 0x2c4c, 0x3e04, 0x0864,
- 0x0834, 0x2c1c, 0x3e4c, 0x5004, 0x1a64,
- 0x1a36, 0x3e1e, 0x504e, 0x0806, 0x2c66,
- 0x2c36, 0x501e, 0x084e, 0x1a06, 0x3e66,
- 0x3e36, 0x081e, 0x1a4e, 0x2c06, 0x5066,
- 0x5036, 0x1a1e, 0x2c4e, 0x3e06, 0x0866,
- 0x0836, 0x2c1e, 0x3e4e, 0x5006, 0x1a66,
- 0x1a38, 0x3e20, 0x5050, 0x0808, 0x2c68,
- 0x2c38, 0x5020, 0x0850, 0x1a08, 0x3e68,
- 0x3e38, 0x0820, 0x1a50, 0x2c08, 0x5068,
- 0x5038, 0x1a20, 0x2c50, 0x3e08, 0x0868,
- 0x0838, 0x2c20, 0x3e50, 0x5008, 0x1a68,
- 0x1a3a, 0x3e22, 0x5052, 0x080a, 0x2c6a,
- 0x2c3a, 0x5022, 0x0852, 0x1a0a, 0x3e6a,
- 0x3e3a, 0x0822, 0x1a52, 0x2c0a, 0x506a,
- 0x503a, 0x1a22, 0x2c52, 0x3e0a, 0x086a,
- 0x083a, 0x2c22, 0x3e52, 0x500a, 0x1a6a,
- 0x1c30, 0x4018, 0x5248, 0x0a00, 0x2e60,
- 0x2e30, 0x5218, 0x0a48, 0x1c00, 0x4060,
- 0x4030, 0x0a18, 0x1c48, 0x2e00, 0x5260,
- 0x5230, 0x1c18, 0x2e48, 0x4000, 0x0a60,
- 0x0a30, 0x2e18, 0x4048, 0x5200, 0x1c60,
- 0x1c32, 0x401a, 0x524a, 0x0a02, 0x2e62,
- 0x2e32, 0x521a, 0x0a4a, 0x1c02, 0x4062,
- 0x4032, 0x0a1a, 0x1c4a, 0x2e02, 0x5262,
- 0x5232, 0x1c1a, 0x2e4a, 0x4002, 0x0a62,
- 0x0a32, 0x2e1a, 0x404a, 0x5202, 0x1c62,
- 0x1c34, 0x401c, 0x524c, 0x0a04, 0x2e64,
- 0x2e34, 0x521c, 0x0a4c, 0x1c04, 0x4064,
- 0x4034, 0x0a1c, 0x1c4c, 0x2e04, 0x5264,
- 0x5234, 0x1c1c, 0x2e4c, 0x4004, 0x0a64,
- 0x0a34, 0x2e1c, 0x404c, 0x5204, 0x1c64,
- 0x1c36, 0x401e, 0x524e, 0x0a06, 0x2e66,
- 0x2e36, 0x521e, 0x0a4e, 0x1c06, 0x4066,
- 0x4036, 0x0a1e, 0x1c4e, 0x2e06, 0x5266,
- 0x5236, 0x1c1e, 0x2e4e, 0x4006, 0x0a66,
- 0x0a36, 0x2e1e, 0x404e, 0x5206, 0x1c66,
- 0x1c38, 0x4020, 0x5250, 0x0a08, 0x2e68,
- 0x2e38, 0x5220, 0x0a50, 0x1c08, 0x4068,
- 0x4038, 0x0a20, 0x1c50, 0x2e08, 0x5268,
- 0x5238, 0x1c20, 0x2e50, 0x4008, 0x0a68,
- 0x0a38, 0x2e20, 0x4050, 0x5208, 0x1c68,
- 0x1c3a, 0x4022, 0x5252, 0x0a0a, 0x2e6a,
- 0x2e3a, 0x5222, 0x0a52, 0x1c0a, 0x406a,
- 0x403a, 0x0a22, 0x1c52, 0x2e0a, 0x526a,
- 0x523a, 0x1c22, 0x2e52, 0x400a, 0x0a6a,
- 0x0a3a, 0x2e22, 0x4052, 0x520a, 0x1c6a,
- 0x1e30, 0x4218, 0x5448, 0x0c00, 0x3060,
- 0x3030, 0x5418, 0x0c48, 0x1e00, 0x4260,
- 0x4230, 0x0c18, 0x1e48, 0x3000, 0x5460,
- 0x5430, 0x1e18, 0x3048, 0x4200, 0x0c60,
- 0x0c30, 0x3018, 0x4248, 0x5400, 0x1e60,
- 0x1e32, 0x421a, 0x544a, 0x0c02, 0x3062,
- 0x3032, 0x541a, 0x0c4a, 0x1e02, 0x4262,
- 0x4232, 0x0c1a, 0x1e4a, 0x3002, 0x5462,
- 0x5432, 0x1e1a, 0x304a, 0x4202, 0x0c62,
- 0x0c32, 0x301a, 0x424a, 0x5402, 0x1e62,
- 0x1e34, 0x421c, 0x544c, 0x0c04, 0x3064,
- 0x3034, 0x541c, 0x0c4c, 0x1e04, 0x4264,
- 0x4234, 0x0c1c, 0x1e4c, 0x3004, 0x5464,
- 0x5434, 0x1e1c, 0x304c, 0x4204, 0x0c64,
- 0x0c34, 0x301c, 0x424c, 0x5404, 0x1e64,
- 0x1e36, 0x421e, 0x544e, 0x0c06, 0x3066,
- 0x3036, 0x541e, 0x0c4e, 0x1e06, 0x4266,
- 0x4236, 0x0c1e, 0x1e4e, 0x3006, 0x5466,
- 0x5436, 0x1e1e, 0x304e, 0x4206, 0x0c66,
- 0x0c36, 0x301e, 0x424e, 0x5406, 0x1e66,
- 0x1e38, 0x4220, 0x5450, 0x0c08, 0x3068,
- 0x3038, 0x5420, 0x0c50, 0x1e08, 0x4268,
- 0x4238, 0x0c20, 0x1e50, 0x3008, 0x5468,
- 0x5438, 0x1e20, 0x3050, 0x4208, 0x0c68,
- 0x0c38, 0x3020, 0x4250, 0x5408, 0x1e68,
- 0x1e3a, 0x4222, 0x5452, 0x0c0a, 0x306a,
- 0x303a, 0x5422, 0x0c52, 0x1e0a, 0x426a,
- 0x423a, 0x0c22, 0x1e52, 0x300a, 0x546a,
- 0x543a, 0x1e22, 0x3052, 0x420a, 0x0c6a,
- 0x0c3a, 0x3022, 0x4252, 0x540a, 0x1e6a,
- 0x2030, 0x4418, 0x5648, 0x0e00, 0x3260,
- 0x3230, 0x5618, 0x0e48, 0x2000, 0x4460,
- 0x4430, 0x0e18, 0x2048, 0x3200, 0x5660,
- 0x5630, 0x2018, 0x3248, 0x4400, 0x0e60,
- 0x0e30, 0x3218, 0x4448, 0x5600, 0x2060,
- 0x2032, 0x441a, 0x564a, 0x0e02, 0x3262,
- 0x3232, 0x561a, 0x0e4a, 0x2002, 0x4462,
- 0x4432, 0x0e1a, 0x204a, 0x3202, 0x5662,
- 0x5632, 0x201a, 0x324a, 0x4402, 0x0e62,
- 0x0e32, 0x321a, 0x444a, 0x5602, 0x2062,
- 0x2034, 0x441c, 0x564c, 0x0e04, 0x3264,
- 0x3234, 0x561c, 0x0e4c, 0x2004, 0x4464,
- 0x4434, 0x0e1c, 0x204c, 0x3204, 0x5664,
- 0x5634, 0x201c, 0x324c, 0x4404, 0x0e64,
- 0x0e34, 0x321c, 0x444c, 0x5604, 0x2064,
- 0x2036, 0x441e, 0x564e, 0x0e06, 0x3266,
- 0x3236, 0x561e, 0x0e4e, 0x2006, 0x4466,
- 0x4436, 0x0e1e, 0x204e, 0x3206, 0x5666,
- 0x5636, 0x201e, 0x324e, 0x4406, 0x0e66,
- 0x0e36, 0x321e, 0x444e, 0x5606, 0x2066,
- 0x2038, 0x4420, 0x5650, 0x0e08, 0x3268,
- 0x3238, 0x5620, 0x0e50, 0x2008, 0x4468,
- 0x4438, 0x0e20, 0x2050, 0x3208, 0x5668,
- 0x5638, 0x2020, 0x3250, 0x4408, 0x0e68,
- 0x0e38, 0x3220, 0x4450, 0x5608, 0x2068,
- 0x203a, 0x4422, 0x5652, 0x0e0a, 0x326a,
- 0x323a, 0x5622, 0x0e52, 0x200a, 0x446a,
- 0x443a, 0x0e22, 0x2052, 0x320a, 0x566a,
- 0x563a, 0x2022, 0x3252, 0x440a, 0x0e6a,
- 0x0e3a, 0x3222, 0x4452, 0x560a, 0x206a,
- 0x2230, 0x4618, 0x5848, 0x1000, 0x3460,
- 0x3430, 0x5818, 0x1048, 0x2200, 0x4660,
- 0x4630, 0x1018, 0x2248, 0x3400, 0x5860,
- 0x5830, 0x2218, 0x3448, 0x4600, 0x1060,
- 0x1030, 0x3418, 0x4648, 0x5800, 0x2260,
- 0x2232, 0x461a, 0x584a, 0x1002, 0x3462,
- 0x3432, 0x581a, 0x104a, 0x2202, 0x4662,
- 0x4632, 0x101a, 0x224a, 0x3402, 0x5862,
- 0x5832, 0x221a, 0x344a, 0x4602, 0x1062,
- 0x1032, 0x341a, 0x464a, 0x5802, 0x2262,
- 0x2234, 0x461c, 0x584c, 0x1004, 0x3464,
- 0x3434, 0x581c, 0x104c, 0x2204, 0x4664,
- 0x4634, 0x101c, 0x224c, 0x3404, 0x5864,
- 0x5834, 0x221c, 0x344c, 0x4604, 0x1064,
- 0x1034, 0x341c, 0x464c, 0x5804, 0x2264,
- 0x2236, 0x461e, 0x584e, 0x1006, 0x3466,
- 0x3436, 0x581e, 0x104e, 0x2206, 0x4666,
- 0x4636, 0x101e, 0x224e, 0x3406, 0x5866,
- 0x5836, 0x221e, 0x344e, 0x4606, 0x1066,
- 0x1036, 0x341e, 0x464e, 0x5806, 0x2266,
- 0x2238, 0x4620, 0x5850, 0x1008, 0x3468,
- 0x3438, 0x5820, 0x1050, 0x2208, 0x4668,
- 0x4638, 0x1020, 0x2250, 0x3408, 0x5868,
- 0x5838, 0x2220, 0x3450, 0x4608, 0x1068,
- 0x1038, 0x3420, 0x4650, 0x5808, 0x2268,
- 0x223a, 0x4622, 0x5852, 0x100a, 0x346a,
- 0x343a, 0x5822, 0x1052, 0x220a, 0x466a,
- 0x463a, 0x1022, 0x2252, 0x340a, 0x586a,
- 0x583a, 0x2222, 0x3452, 0x460a, 0x106a,
- 0x103a, 0x3422, 0x4652, 0x580a, 0x226a,
- 0x363c, 0x0024, 0x1254, 0x240c, 0x486c,
- 0x483c, 0x1224, 0x2454, 0x360c, 0x006c,
- 0x003c, 0x2424, 0x3654, 0x480c, 0x126c,
- 0x123c, 0x3624, 0x4854, 0x000c, 0x246c,
- 0x243c, 0x4824, 0x0054, 0x120c, 0x366c,
- 0x363e, 0x0026, 0x1256, 0x240e, 0x486e,
- 0x483e, 0x1226, 0x2456, 0x360e, 0x006e,
- 0x003e, 0x2426, 0x3656, 0x480e, 0x126e,
- 0x123e, 0x3626, 0x4856, 0x000e, 0x246e,
- 0x243e, 0x4826, 0x0056, 0x120e, 0x366e,
- 0x3640, 0x0028, 0x1258, 0x2410, 0x4870,
- 0x4840, 0x1228, 0x2458, 0x3610, 0x0070,
- 0x0040, 0x2428, 0x3658, 0x4810, 0x1270,
- 0x1240, 0x3628, 0x4858, 0x0010, 0x2470,
- 0x2440, 0x4828, 0x0058, 0x1210, 0x3670,
- 0x3642, 0x002a, 0x125a, 0x2412, 0x4872,
- 0x4842, 0x122a, 0x245a, 0x3612, 0x0072,
- 0x0042, 0x242a, 0x365a, 0x4812, 0x1272,
- 0x1242, 0x362a, 0x485a, 0x0012, 0x2472,
- 0x2442, 0x482a, 0x005a, 0x1212, 0x3672,
- 0x3644, 0x002c, 0x125c, 0x2414, 0x4874,
- 0x4844, 0x122c, 0x245c, 0x3614, 0x0074,
- 0x0044, 0x242c, 0x365c, 0x4814, 0x1274,
- 0x1244, 0x362c, 0x485c, 0x0014, 0x2474,
- 0x2444, 0x482c, 0x005c, 0x1214, 0x3674,
- 0x3646, 0x002e, 0x125e, 0x2416, 0x4876,
- 0x4846, 0x122e, 0x245e, 0x3616, 0x0076,
- 0x0046, 0x242e, 0x365e, 0x4816, 0x1276,
- 0x1246, 0x362e, 0x485e, 0x0016, 0x2476,
- 0x2446, 0x482e, 0x005e, 0x1216, 0x3676,
- 0x383c, 0x0224, 0x1454, 0x260c, 0x4a6c,
- 0x4a3c, 0x1424, 0x2654, 0x380c, 0x026c,
- 0x023c, 0x2624, 0x3854, 0x4a0c, 0x146c,
- 0x143c, 0x3824, 0x4a54, 0x020c, 0x266c,
- 0x263c, 0x4a24, 0x0254, 0x140c, 0x386c,
- 0x383e, 0x0226, 0x1456, 0x260e, 0x4a6e,
- 0x4a3e, 0x1426, 0x2656, 0x380e, 0x026e,
- 0x023e, 0x2626, 0x3856, 0x4a0e, 0x146e,
- 0x143e, 0x3826, 0x4a56, 0x020e, 0x266e,
- 0x263e, 0x4a26, 0x0256, 0x140e, 0x386e,
- 0x3840, 0x0228, 0x1458, 0x2610, 0x4a70,
- 0x4a40, 0x1428, 0x2658, 0x3810, 0x0270,
- 0x0240, 0x2628, 0x3858, 0x4a10, 0x1470,
- 0x1440, 0x3828, 0x4a58, 0x0210, 0x2670,
- 0x2640, 0x4a28, 0x0258, 0x1410, 0x3870,
- 0x3842, 0x022a, 0x145a, 0x2612, 0x4a72,
- 0x4a42, 0x142a, 0x265a, 0x3812, 0x0272,
- 0x0242, 0x262a, 0x385a, 0x4a12, 0x1472,
- 0x1442, 0x382a, 0x4a5a, 0x0212, 0x2672,
- 0x2642, 0x4a2a, 0x025a, 0x1412, 0x3872,
- 0x3844, 0x022c, 0x145c, 0x2614, 0x4a74,
- 0x4a44, 0x142c, 0x265c, 0x3814, 0x0274,
- 0x0244, 0x262c, 0x385c, 0x4a14, 0x1474,
- 0x1444, 0x382c, 0x4a5c, 0x0214, 0x2674,
- 0x2644, 0x4a2c, 0x025c, 0x1414, 0x3874,
- 0x3846, 0x022e, 0x145e, 0x2616, 0x4a76,
- 0x4a46, 0x142e, 0x265e, 0x3816, 0x0276,
- 0x0246, 0x262e, 0x385e, 0x4a16, 0x1476,
- 0x1446, 0x382e, 0x4a5e, 0x0216, 0x2676,
- 0x2646, 0x4a2e, 0x025e, 0x1416, 0x3876,
- 0x3a3c, 0x0424, 0x1654, 0x280c, 0x4c6c,
- 0x4c3c, 0x1624, 0x2854, 0x3a0c, 0x046c,
- 0x043c, 0x2824, 0x3a54, 0x4c0c, 0x166c,
- 0x163c, 0x3a24, 0x4c54, 0x040c, 0x286c,
- 0x283c, 0x4c24, 0x0454, 0x160c, 0x3a6c,
- 0x3a3e, 0x0426, 0x1656, 0x280e, 0x4c6e,
- 0x4c3e, 0x1626, 0x2856, 0x3a0e, 0x046e,
- 0x043e, 0x2826, 0x3a56, 0x4c0e, 0x166e,
- 0x163e, 0x3a26, 0x4c56, 0x040e, 0x286e,
- 0x283e, 0x4c26, 0x0456, 0x160e, 0x3a6e,
- 0x3a40, 0x0428, 0x1658, 0x2810, 0x4c70,
- 0x4c40, 0x1628, 0x2858, 0x3a10, 0x0470,
- 0x0440, 0x2828, 0x3a58, 0x4c10, 0x1670,
- 0x1640, 0x3a28, 0x4c58, 0x0410, 0x2870,
- 0x2840, 0x4c28, 0x0458, 0x1610, 0x3a70,
- 0x3a42, 0x042a, 0x165a, 0x2812, 0x4c72,
- 0x4c42, 0x162a, 0x285a, 0x3a12, 0x0472,
- 0x0442, 0x282a, 0x3a5a, 0x4c12, 0x1672,
- 0x1642, 0x3a2a, 0x4c5a, 0x0412, 0x2872,
- 0x2842, 0x4c2a, 0x045a, 0x1612, 0x3a72,
- 0x3a44, 0x042c, 0x165c, 0x2814, 0x4c74,
- 0x4c44, 0x162c, 0x285c, 0x3a14, 0x0474,
- 0x0444, 0x282c, 0x3a5c, 0x4c14, 0x1674,
- 0x1644, 0x3a2c, 0x4c5c, 0x0414, 0x2874,
- 0x2844, 0x4c2c, 0x045c, 0x1614, 0x3a74,
- 0x3a46, 0x042e, 0x165e, 0x2816, 0x4c76,
- 0x4c46, 0x162e, 0x285e, 0x3a16, 0x0476,
- 0x0446, 0x282e, 0x3a5e, 0x4c16, 0x1676,
- 0x1646, 0x3a2e, 0x4c5e, 0x0416, 0x2876,
- 0x2846, 0x4c2e, 0x045e, 0x1616, 0x3a76,
- 0x3c3c, 0x0624, 0x1854, 0x2a0c, 0x4e6c,
- 0x4e3c, 0x1824, 0x2a54, 0x3c0c, 0x066c,
- 0x063c, 0x2a24, 0x3c54, 0x4e0c, 0x186c,
- 0x183c, 0x3c24, 0x4e54, 0x060c, 0x2a6c,
- 0x2a3c, 0x4e24, 0x0654, 0x180c, 0x3c6c,
- 0x3c3e, 0x0626, 0x1856, 0x2a0e, 0x4e6e,
- 0x4e3e, 0x1826, 0x2a56, 0x3c0e, 0x066e,
- 0x063e, 0x2a26, 0x3c56, 0x4e0e, 0x186e,
- 0x183e, 0x3c26, 0x4e56, 0x060e, 0x2a6e,
- 0x2a3e, 0x4e26, 0x0656, 0x180e, 0x3c6e,
- 0x3c40, 0x0628, 0x1858, 0x2a10, 0x4e70,
- 0x4e40, 0x1828, 0x2a58, 0x3c10, 0x0670,
- 0x0640, 0x2a28, 0x3c58, 0x4e10, 0x1870,
- 0x1840, 0x3c28, 0x4e58, 0x0610, 0x2a70,
- 0x2a40, 0x4e28, 0x0658, 0x1810, 0x3c70,
- 0x3c42, 0x062a, 0x185a, 0x2a12, 0x4e72,
- 0x4e42, 0x182a, 0x2a5a, 0x3c12, 0x0672,
- 0x0642, 0x2a2a, 0x3c5a, 0x4e12, 0x1872,
- 0x1842, 0x3c2a, 0x4e5a, 0x0612, 0x2a72,
- 0x2a42, 0x4e2a, 0x065a, 0x1812, 0x3c72,
- 0x3c44, 0x062c, 0x185c, 0x2a14, 0x4e74,
- 0x4e44, 0x182c, 0x2a5c, 0x3c14, 0x0674,
- 0x0644, 0x2a2c, 0x3c5c, 0x4e14, 0x1874,
- 0x1844, 0x3c2c, 0x4e5c, 0x0614, 0x2a74,
- 0x2a44, 0x4e2c, 0x065c, 0x1814, 0x3c74,
- 0x3c46, 0x062e, 0x185e, 0x2a16, 0x4e76,
- 0x4e46, 0x182e, 0x2a5e, 0x3c16, 0x0676,
- 0x0646, 0x2a2e, 0x3c5e, 0x4e16, 0x1876,
- 0x1846, 0x3c2e, 0x4e5e, 0x0616, 0x2a76,
- 0x2a46, 0x4e2e, 0x065e, 0x1816, 0x3c76,
- 0x3e3c, 0x0824, 0x1a54, 0x2c0c, 0x506c,
- 0x503c, 0x1a24, 0x2c54, 0x3e0c, 0x086c,
- 0x083c, 0x2c24, 0x3e54, 0x500c, 0x1a6c,
- 0x1a3c, 0x3e24, 0x5054, 0x080c, 0x2c6c,
- 0x2c3c, 0x5024, 0x0854, 0x1a0c, 0x3e6c,
- 0x3e3e, 0x0826, 0x1a56, 0x2c0e, 0x506e,
- 0x503e, 0x1a26, 0x2c56, 0x3e0e, 0x086e,
- 0x083e, 0x2c26, 0x3e56, 0x500e, 0x1a6e,
- 0x1a3e, 0x3e26, 0x5056, 0x080e, 0x2c6e,
- 0x2c3e, 0x5026, 0x0856, 0x1a0e, 0x3e6e,
- 0x3e40, 0x0828, 0x1a58, 0x2c10, 0x5070,
- 0x5040, 0x1a28, 0x2c58, 0x3e10, 0x0870,
- 0x0840, 0x2c28, 0x3e58, 0x5010, 0x1a70,
- 0x1a40, 0x3e28, 0x5058, 0x0810, 0x2c70,
- 0x2c40, 0x5028, 0x0858, 0x1a10, 0x3e70,
- 0x3e42, 0x082a, 0x1a5a, 0x2c12, 0x5072,
- 0x5042, 0x1a2a, 0x2c5a, 0x3e12, 0x0872,
- 0x0842, 0x2c2a, 0x3e5a, 0x5012, 0x1a72,
- 0x1a42, 0x3e2a, 0x505a, 0x0812, 0x2c72,
- 0x2c42, 0x502a, 0x085a, 0x1a12, 0x3e72,
- 0x3e44, 0x082c, 0x1a5c, 0x2c14, 0x5074,
- 0x5044, 0x1a2c, 0x2c5c, 0x3e14, 0x0874,
- 0x0844, 0x2c2c, 0x3e5c, 0x5014, 0x1a74,
- 0x1a44, 0x3e2c, 0x505c, 0x0814, 0x2c74,
- 0x2c44, 0x502c, 0x085c, 0x1a14, 0x3e74,
- 0x3e46, 0x082e, 0x1a5e, 0x2c16, 0x5076,
- 0x5046, 0x1a2e, 0x2c5e, 0x3e16, 0x0876,
- 0x0846, 0x2c2e, 0x3e5e, 0x5016, 0x1a76,
- 0x1a46, 0x3e2e, 0x505e, 0x0816, 0x2c76,
- 0x2c46, 0x502e, 0x085e, 0x1a16, 0x3e76,
- 0x403c, 0x0a24, 0x1c54, 0x2e0c, 0x526c,
- 0x523c, 0x1c24, 0x2e54, 0x400c, 0x0a6c,
- 0x0a3c, 0x2e24, 0x4054, 0x520c, 0x1c6c,
- 0x1c3c, 0x4024, 0x5254, 0x0a0c, 0x2e6c,
- 0x2e3c, 0x5224, 0x0a54, 0x1c0c, 0x406c,
- 0x403e, 0x0a26, 0x1c56, 0x2e0e, 0x526e,
- 0x523e, 0x1c26, 0x2e56, 0x400e, 0x0a6e,
- 0x0a3e, 0x2e26, 0x4056, 0x520e, 0x1c6e,
- 0x1c3e, 0x4026, 0x5256, 0x0a0e, 0x2e6e,
- 0x2e3e, 0x5226, 0x0a56, 0x1c0e, 0x406e,
- 0x4040, 0x0a28, 0x1c58, 0x2e10, 0x5270,
- 0x5240, 0x1c28, 0x2e58, 0x4010, 0x0a70,
- 0x0a40, 0x2e28, 0x4058, 0x5210, 0x1c70,
- 0x1c40, 0x4028, 0x5258, 0x0a10, 0x2e70,
- 0x2e40, 0x5228, 0x0a58, 0x1c10, 0x4070,
- 0x4042, 0x0a2a, 0x1c5a, 0x2e12, 0x5272,
- 0x5242, 0x1c2a, 0x2e5a, 0x4012, 0x0a72,
- 0x0a42, 0x2e2a, 0x405a, 0x5212, 0x1c72,
- 0x1c42, 0x402a, 0x525a, 0x0a12, 0x2e72,
- 0x2e42, 0x522a, 0x0a5a, 0x1c12, 0x4072,
- 0x4044, 0x0a2c, 0x1c5c, 0x2e14, 0x5274,
- 0x5244, 0x1c2c, 0x2e5c, 0x4014, 0x0a74,
- 0x0a44, 0x2e2c, 0x405c, 0x5214, 0x1c74,
- 0x1c44, 0x402c, 0x525c, 0x0a14, 0x2e74,
- 0x2e44, 0x522c, 0x0a5c, 0x1c14, 0x4074,
- 0x4046, 0x0a2e, 0x1c5e, 0x2e16, 0x5276,
- 0x5246, 0x1c2e, 0x2e5e, 0x4016, 0x0a76,
- 0x0a46, 0x2e2e, 0x405e, 0x5216, 0x1c76,
- 0x1c46, 0x402e, 0x525e, 0x0a16, 0x2e76,
- 0x2e46, 0x522e, 0x0a5e, 0x1c16, 0x4076,
- 0x423c, 0x0c24, 0x1e54, 0x300c, 0x546c,
- 0x543c, 0x1e24, 0x3054, 0x420c, 0x0c6c,
- 0x0c3c, 0x3024, 0x4254, 0x540c, 0x1e6c,
- 0x1e3c, 0x4224, 0x5454, 0x0c0c, 0x306c,
- 0x303c, 0x5424, 0x0c54, 0x1e0c, 0x426c,
- 0x423e, 0x0c26, 0x1e56, 0x300e, 0x546e,
- 0x543e, 0x1e26, 0x3056, 0x420e, 0x0c6e,
- 0x0c3e, 0x3026, 0x4256, 0x540e, 0x1e6e,
- 0x1e3e, 0x4226, 0x5456, 0x0c0e, 0x306e,
- 0x303e, 0x5426, 0x0c56, 0x1e0e, 0x426e,
- 0x4240, 0x0c28, 0x1e58, 0x3010, 0x5470,
- 0x5440, 0x1e28, 0x3058, 0x4210, 0x0c70,
- 0x0c40, 0x3028, 0x4258, 0x5410, 0x1e70,
- 0x1e40, 0x4228, 0x5458, 0x0c10, 0x3070,
- 0x3040, 0x5428, 0x0c58, 0x1e10, 0x4270,
- 0x4242, 0x0c2a, 0x1e5a, 0x3012, 0x5472,
- 0x5442, 0x1e2a, 0x305a, 0x4212, 0x0c72,
- 0x0c42, 0x302a, 0x425a, 0x5412, 0x1e72,
- 0x1e42, 0x422a, 0x545a, 0x0c12, 0x3072,
- 0x3042, 0x542a, 0x0c5a, 0x1e12, 0x4272,
- 0x4244, 0x0c2c, 0x1e5c, 0x3014, 0x5474,
- 0x5444, 0x1e2c, 0x305c, 0x4214, 0x0c74,
- 0x0c44, 0x302c, 0x425c, 0x5414, 0x1e74,
- 0x1e44, 0x422c, 0x545c, 0x0c14, 0x3074,
- 0x3044, 0x542c, 0x0c5c, 0x1e14, 0x4274,
- 0x4246, 0x0c2e, 0x1e5e, 0x3016, 0x5476,
- 0x5446, 0x1e2e, 0x305e, 0x4216, 0x0c76,
- 0x0c46, 0x302e, 0x425e, 0x5416, 0x1e76,
- 0x1e46, 0x422e, 0x545e, 0x0c16, 0x3076,
- 0x3046, 0x542e, 0x0c5e, 0x1e16, 0x4276,
- 0x443c, 0x0e24, 0x2054, 0x320c, 0x566c,
- 0x563c, 0x2024, 0x3254, 0x440c, 0x0e6c,
- 0x0e3c, 0x3224, 0x4454, 0x560c, 0x206c,
- 0x203c, 0x4424, 0x5654, 0x0e0c, 0x326c,
- 0x323c, 0x5624, 0x0e54, 0x200c, 0x446c,
- 0x443e, 0x0e26, 0x2056, 0x320e, 0x566e,
- 0x563e, 0x2026, 0x3256, 0x440e, 0x0e6e,
- 0x0e3e, 0x3226, 0x4456, 0x560e, 0x206e,
- 0x203e, 0x4426, 0x5656, 0x0e0e, 0x326e,
- 0x323e, 0x5626, 0x0e56, 0x200e, 0x446e,
- 0x4440, 0x0e28, 0x2058, 0x3210, 0x5670,
- 0x5640, 0x2028, 0x3258, 0x4410, 0x0e70,
- 0x0e40, 0x3228, 0x4458, 0x5610, 0x2070,
- 0x2040, 0x4428, 0x5658, 0x0e10, 0x3270,
- 0x3240, 0x5628, 0x0e58, 0x2010, 0x4470,
- 0x4442, 0x0e2a, 0x205a, 0x3212, 0x5672,
- 0x5642, 0x202a, 0x325a, 0x4412, 0x0e72,
- 0x0e42, 0x322a, 0x445a, 0x5612, 0x2072,
- 0x2042, 0x442a, 0x565a, 0x0e12, 0x3272,
- 0x3242, 0x562a, 0x0e5a, 0x2012, 0x4472,
- 0x4444, 0x0e2c, 0x205c, 0x3214, 0x5674,
- 0x5644, 0x202c, 0x325c, 0x4414, 0x0e74,
- 0x0e44, 0x322c, 0x445c, 0x5614, 0x2074,
- 0x2044, 0x442c, 0x565c, 0x0e14, 0x3274,
- 0x3244, 0x562c, 0x0e5c, 0x2014, 0x4474,
- 0x4446, 0x0e2e, 0x205e, 0x3216, 0x5676,
- 0x5646, 0x202e, 0x325e, 0x4416, 0x0e76,
- 0x0e46, 0x322e, 0x445e, 0x5616, 0x2076,
- 0x2046, 0x442e, 0x565e, 0x0e16, 0x3276,
- 0x3246, 0x562e, 0x0e5e, 0x2016, 0x4476,
- 0x463c, 0x1024, 0x2254, 0x340c, 0x586c,
- 0x583c, 0x2224, 0x3454, 0x460c, 0x106c,
- 0x103c, 0x3424, 0x4654, 0x580c, 0x226c,
- 0x223c, 0x4624, 0x5854, 0x100c, 0x346c,
- 0x343c, 0x5824, 0x1054, 0x220c, 0x466c,
- 0x463e, 0x1026, 0x2256, 0x340e, 0x586e,
- 0x583e, 0x2226, 0x3456, 0x460e, 0x106e,
- 0x103e, 0x3426, 0x4656, 0x580e, 0x226e,
- 0x223e, 0x4626, 0x5856, 0x100e, 0x346e,
- 0x343e, 0x5826, 0x1056, 0x220e, 0x466e,
- 0x4640, 0x1028, 0x2258, 0x3410, 0x5870,
- 0x5840, 0x2228, 0x3458, 0x4610, 0x1070,
- 0x1040, 0x3428, 0x4658, 0x5810, 0x2270,
- 0x2240, 0x4628, 0x5858, 0x1010, 0x3470,
- 0x3440, 0x5828, 0x1058, 0x2210, 0x4670,
- 0x4642, 0x102a, 0x225a, 0x3412, 0x5872,
- 0x5842, 0x222a, 0x345a, 0x4612, 0x1072,
- 0x1042, 0x342a, 0x465a, 0x5812, 0x2272,
- 0x2242, 0x462a, 0x585a, 0x1012, 0x3472,
- 0x3442, 0x582a, 0x105a, 0x2212, 0x4672,
- 0x4644, 0x102c, 0x225c, 0x3414, 0x5874,
- 0x5844, 0x222c, 0x345c, 0x4614, 0x1074,
- 0x1044, 0x342c, 0x465c, 0x5814, 0x2274,
- 0x2244, 0x462c, 0x585c, 0x1014, 0x3474,
- 0x3444, 0x582c, 0x105c, 0x2214, 0x4674,
- 0x4646, 0x102e, 0x225e, 0x3416, 0x5876,
- 0x5846, 0x222e, 0x345e, 0x4616, 0x1076,
- 0x1046, 0x342e, 0x465e, 0x5816, 0x2276,
- 0x2246, 0x462e, 0x585e, 0x1016, 0x3476,
- 0x3446, 0x582e, 0x105e, 0x2216, 0x4676,
-};
-
-static const uint16_t dv_place_720p50[2*12*27*5] = {
- 0x1230, 0x3618, 0x4848, 0x0000, 0x2460,
- 0x2430, 0x4818, 0x0048, 0x1200, 0x3660,
- 0x3630, 0x0018, 0x1248, 0x2400, 0x4860,
- 0x4830, 0x1218, 0x2448, 0x3600, 0x0060,
- 0x0030, 0x2418, 0x3648, 0x4800, 0x1260,
- 0x1232, 0x361a, 0x484a, 0x0002, 0x2462,
- 0x2432, 0x481a, 0x004a, 0x1202, 0x3662,
- 0x3632, 0x001a, 0x124a, 0x2402, 0x4862,
- 0x4832, 0x121a, 0x244a, 0x3602, 0x0062,
- 0x0032, 0x241a, 0x364a, 0x4802, 0x1262,
- 0x1234, 0x361c, 0x484c, 0x0004, 0x2464,
- 0x2434, 0x481c, 0x004c, 0x1204, 0x3664,
- 0x3634, 0x001c, 0x124c, 0x2404, 0x4864,
- 0x4834, 0x121c, 0x244c, 0x3604, 0x0064,
- 0x0034, 0x241c, 0x364c, 0x4804, 0x1264,
- 0x1236, 0x361e, 0x484e, 0x0006, 0x2466,
- 0x2436, 0x481e, 0x004e, 0x1206, 0x3666,
- 0x3636, 0x001e, 0x124e, 0x2406, 0x4866,
- 0x4836, 0x121e, 0x244e, 0x3606, 0x0066,
- 0x0036, 0x241e, 0x364e, 0x4806, 0x1266,
- 0x1238, 0x3620, 0x4850, 0x0008, 0x2468,
- 0x2438, 0x4820, 0x0050, 0x1208, 0x3668,
- 0x3638, 0x0020, 0x1250, 0x2408, 0x4868,
- 0x4838, 0x1220, 0x2450, 0x3608, 0x0068,
- 0x0038, 0x2420, 0x3650, 0x4808, 0x1268,
- 0x123a, 0x3622, 0x4852, 0x000a, 0x246a,
- 0x243a, 0x4822, 0x0052, 0x120a, 0x366a,
- 0x363a, 0x0022, 0x1252, 0x240a, 0x486a,
- 0x483a, 0x1222, 0x2452, 0x360a, 0x006a,
- 0x003a, 0x2422, 0x3652, 0x480a, 0x126a,
- 0x1430, 0x3818, 0x4a48, 0x0200, 0x2660,
- 0x2630, 0x4a18, 0x0248, 0x1400, 0x3860,
- 0x3830, 0x0218, 0x1448, 0x2600, 0x4a60,
- 0x4a30, 0x1418, 0x2648, 0x3800, 0x0260,
- 0x0230, 0x2618, 0x3848, 0x4a00, 0x1460,
- 0x1432, 0x381a, 0x4a4a, 0x0202, 0x2662,
- 0x2632, 0x4a1a, 0x024a, 0x1402, 0x3862,
- 0x3832, 0x021a, 0x144a, 0x2602, 0x4a62,
- 0x4a32, 0x141a, 0x264a, 0x3802, 0x0262,
- 0x0232, 0x261a, 0x384a, 0x4a02, 0x1462,
- 0x1434, 0x381c, 0x4a4c, 0x0204, 0x2664,
- 0x2634, 0x4a1c, 0x024c, 0x1404, 0x3864,
- 0x3834, 0x021c, 0x144c, 0x2604, 0x4a64,
- 0x4a34, 0x141c, 0x264c, 0x3804, 0x0264,
- 0x0234, 0x261c, 0x384c, 0x4a04, 0x1464,
- 0x1436, 0x381e, 0x4a4e, 0x0206, 0x2666,
- 0x2636, 0x4a1e, 0x024e, 0x1406, 0x3866,
- 0x3836, 0x021e, 0x144e, 0x2606, 0x4a66,
- 0x4a36, 0x141e, 0x264e, 0x3806, 0x0266,
- 0x0236, 0x261e, 0x384e, 0x4a06, 0x1466,
- 0x1438, 0x3820, 0x4a50, 0x0208, 0x2668,
- 0x2638, 0x4a20, 0x0250, 0x1408, 0x3868,
- 0x3838, 0x0220, 0x1450, 0x2608, 0x4a68,
- 0x4a38, 0x1420, 0x2650, 0x3808, 0x0268,
- 0x0238, 0x2620, 0x3850, 0x4a08, 0x1468,
- 0x143a, 0x3822, 0x4a52, 0x020a, 0x266a,
- 0x263a, 0x4a22, 0x0252, 0x140a, 0x386a,
- 0x383a, 0x0222, 0x1452, 0x260a, 0x4a6a,
- 0x4a3a, 0x1422, 0x2652, 0x380a, 0x026a,
- 0x023a, 0x2622, 0x3852, 0x4a0a, 0x146a,
- 0x1630, 0x3a18, 0x4c48, 0x0400, 0x2860,
- 0x2830, 0x4c18, 0x0448, 0x1600, 0x3a60,
- 0x3a30, 0x0418, 0x1648, 0x2800, 0x4c60,
- 0x4c30, 0x1618, 0x2848, 0x3a00, 0x0460,
- 0x0430, 0x2818, 0x3a48, 0x4c00, 0x1660,
- 0x1632, 0x3a1a, 0x4c4a, 0x0402, 0x2862,
- 0x2832, 0x4c1a, 0x044a, 0x1602, 0x3a62,
- 0x3a32, 0x041a, 0x164a, 0x2802, 0x4c62,
- 0x4c32, 0x161a, 0x284a, 0x3a02, 0x0462,
- 0x0432, 0x281a, 0x3a4a, 0x4c02, 0x1662,
- 0x1634, 0x3a1c, 0x4c4c, 0x0404, 0x2864,
- 0x2834, 0x4c1c, 0x044c, 0x1604, 0x3a64,
- 0x3a34, 0x041c, 0x164c, 0x2804, 0x4c64,
- 0x4c34, 0x161c, 0x284c, 0x3a04, 0x0464,
- 0x0434, 0x281c, 0x3a4c, 0x4c04, 0x1664,
- 0x1636, 0x3a1e, 0x4c4e, 0x0406, 0x2866,
- 0x2836, 0x4c1e, 0x044e, 0x1606, 0x3a66,
- 0x3a36, 0x041e, 0x164e, 0x2806, 0x4c66,
- 0x4c36, 0x161e, 0x284e, 0x3a06, 0x0466,
- 0x0436, 0x281e, 0x3a4e, 0x4c06, 0x1666,
- 0x1638, 0x3a20, 0x4c50, 0x0408, 0x2868,
- 0x2838, 0x4c20, 0x0450, 0x1608, 0x3a68,
- 0x3a38, 0x0420, 0x1650, 0x2808, 0x4c68,
- 0x4c38, 0x1620, 0x2850, 0x3a08, 0x0468,
- 0x0438, 0x2820, 0x3a50, 0x4c08, 0x1668,
- 0x163a, 0x3a22, 0x4c52, 0x040a, 0x286a,
- 0x283a, 0x4c22, 0x0452, 0x160a, 0x3a6a,
- 0x3a3a, 0x0422, 0x1652, 0x280a, 0x4c6a,
- 0x4c3a, 0x1622, 0x2852, 0x3a0a, 0x046a,
- 0x043a, 0x2822, 0x3a52, 0x4c0a, 0x166a,
- 0x1830, 0x3c18, 0x4e48, 0x0600, 0x2a60,
- 0x2a30, 0x4e18, 0x0648, 0x1800, 0x3c60,
- 0x3c30, 0x0618, 0x1848, 0x2a00, 0x4e60,
- 0x4e30, 0x1818, 0x2a48, 0x3c00, 0x0660,
- 0x0630, 0x2a18, 0x3c48, 0x4e00, 0x1860,
- 0x1832, 0x3c1a, 0x4e4a, 0x0602, 0x2a62,
- 0x2a32, 0x4e1a, 0x064a, 0x1802, 0x3c62,
- 0x3c32, 0x061a, 0x184a, 0x2a02, 0x4e62,
- 0x4e32, 0x181a, 0x2a4a, 0x3c02, 0x0662,
- 0x0632, 0x2a1a, 0x3c4a, 0x4e02, 0x1862,
- 0x1834, 0x3c1c, 0x4e4c, 0x0604, 0x2a64,
- 0x2a34, 0x4e1c, 0x064c, 0x1804, 0x3c64,
- 0x3c34, 0x061c, 0x184c, 0x2a04, 0x4e64,
- 0x4e34, 0x181c, 0x2a4c, 0x3c04, 0x0664,
- 0x0634, 0x2a1c, 0x3c4c, 0x4e04, 0x1864,
- 0x1836, 0x3c1e, 0x4e4e, 0x0606, 0x2a66,
- 0x2a36, 0x4e1e, 0x064e, 0x1806, 0x3c66,
- 0x3c36, 0x061e, 0x184e, 0x2a06, 0x4e66,
- 0x4e36, 0x181e, 0x2a4e, 0x3c06, 0x0666,
- 0x0636, 0x2a1e, 0x3c4e, 0x4e06, 0x1866,
- 0x1838, 0x3c20, 0x4e50, 0x0608, 0x2a68,
- 0x2a38, 0x4e20, 0x0650, 0x1808, 0x3c68,
- 0x3c38, 0x0620, 0x1850, 0x2a08, 0x4e68,
- 0x4e38, 0x1820, 0x2a50, 0x3c08, 0x0668,
- 0x0638, 0x2a20, 0x3c50, 0x4e08, 0x1868,
- 0x183a, 0x3c22, 0x4e52, 0x060a, 0x2a6a,
- 0x2a3a, 0x4e22, 0x0652, 0x180a, 0x3c6a,
- 0x3c3a, 0x0622, 0x1852, 0x2a0a, 0x4e6a,
- 0x4e3a, 0x1822, 0x2a52, 0x3c0a, 0x066a,
- 0x063a, 0x2a22, 0x3c52, 0x4e0a, 0x186a,
- 0x1a30, 0x3e18, 0x5048, 0x0800, 0x2c60,
- 0x2c30, 0x5018, 0x0848, 0x1a00, 0x3e60,
- 0x3e30, 0x0818, 0x1a48, 0x2c00, 0x5060,
- 0x5030, 0x1a18, 0x2c48, 0x3e00, 0x0860,
- 0x0830, 0x2c18, 0x3e48, 0x5000, 0x1a60,
- 0x1a32, 0x3e1a, 0x504a, 0x0802, 0x2c62,
- 0x2c32, 0x501a, 0x084a, 0x1a02, 0x3e62,
- 0x3e32, 0x081a, 0x1a4a, 0x2c02, 0x5062,
- 0x5032, 0x1a1a, 0x2c4a, 0x3e02, 0x0862,
- 0x0832, 0x2c1a, 0x3e4a, 0x5002, 0x1a62,
- 0x1a34, 0x3e1c, 0x504c, 0x0804, 0x2c64,
- 0x2c34, 0x501c, 0x084c, 0x1a04, 0x3e64,
- 0x3e34, 0x081c, 0x1a4c, 0x2c04, 0x5064,
- 0x5034, 0x1a1c, 0x2c4c, 0x3e04, 0x0864,
- 0x0834, 0x2c1c, 0x3e4c, 0x5004, 0x1a64,
- 0x1a36, 0x3e1e, 0x504e, 0x0806, 0x2c66,
- 0x2c36, 0x501e, 0x084e, 0x1a06, 0x3e66,
- 0x3e36, 0x081e, 0x1a4e, 0x2c06, 0x5066,
- 0x5036, 0x1a1e, 0x2c4e, 0x3e06, 0x0866,
- 0x0836, 0x2c1e, 0x3e4e, 0x5006, 0x1a66,
- 0x1a38, 0x3e20, 0x5050, 0x0808, 0x2c68,
- 0x2c38, 0x5020, 0x0850, 0x1a08, 0x3e68,
- 0x3e38, 0x0820, 0x1a50, 0x2c08, 0x5068,
- 0x5038, 0x1a20, 0x2c50, 0x3e08, 0x0868,
- 0x0838, 0x2c20, 0x3e50, 0x5008, 0x1a68,
- 0x1a3a, 0x3e22, 0x5052, 0x080a, 0x2c6a,
- 0x2c3a, 0x5022, 0x0852, 0x1a0a, 0x3e6a,
- 0x3e3a, 0x0822, 0x1a52, 0x2c0a, 0x506a,
- 0x503a, 0x1a22, 0x2c52, 0x3e0a, 0x086a,
- 0x083a, 0x2c22, 0x3e52, 0x500a, 0x1a6a,
- 0x1c30, 0x4018, 0x5248, 0x0a00, 0x2e60,
- 0x2e30, 0x5218, 0x0a48, 0x1c00, 0x4060,
- 0x4030, 0x0a18, 0x1c48, 0x2e00, 0x5260,
- 0x5230, 0x1c18, 0x2e48, 0x4000, 0x0a60,
- 0x0a30, 0x2e18, 0x4048, 0x5200, 0x1c60,
- 0x1c32, 0x401a, 0x524a, 0x0a02, 0x2e62,
- 0x2e32, 0x521a, 0x0a4a, 0x1c02, 0x4062,
- 0x4032, 0x0a1a, 0x1c4a, 0x2e02, 0x5262,
- 0x5232, 0x1c1a, 0x2e4a, 0x4002, 0x0a62,
- 0x0a32, 0x2e1a, 0x404a, 0x5202, 0x1c62,
- 0x1c34, 0x401c, 0x524c, 0x0a04, 0x2e64,
- 0x2e34, 0x521c, 0x0a4c, 0x1c04, 0x4064,
- 0x4034, 0x0a1c, 0x1c4c, 0x2e04, 0x5264,
- 0x5234, 0x1c1c, 0x2e4c, 0x4004, 0x0a64,
- 0x0a34, 0x2e1c, 0x404c, 0x5204, 0x1c64,
- 0x1c36, 0x401e, 0x524e, 0x0a06, 0x2e66,
- 0x2e36, 0x521e, 0x0a4e, 0x1c06, 0x4066,
- 0x4036, 0x0a1e, 0x1c4e, 0x2e06, 0x5266,
- 0x5236, 0x1c1e, 0x2e4e, 0x4006, 0x0a66,
- 0x0a36, 0x2e1e, 0x404e, 0x5206, 0x1c66,
- 0x1c38, 0x4020, 0x5250, 0x0a08, 0x2e68,
- 0x2e38, 0x5220, 0x0a50, 0x1c08, 0x4068,
- 0x4038, 0x0a20, 0x1c50, 0x2e08, 0x5268,
- 0x5238, 0x1c20, 0x2e50, 0x4008, 0x0a68,
- 0x0a38, 0x2e20, 0x4050, 0x5208, 0x1c68,
- 0x1c3a, 0x4022, 0x5252, 0x0a0a, 0x2e6a,
- 0x2e3a, 0x5222, 0x0a52, 0x1c0a, 0x406a,
- 0x403a, 0x0a22, 0x1c52, 0x2e0a, 0x526a,
- 0x523a, 0x1c22, 0x2e52, 0x400a, 0x0a6a,
- 0x0a3a, 0x2e22, 0x4052, 0x520a, 0x1c6a,
- 0x1e30, 0x4218, 0x5448, 0x0c00, 0x3060,
- 0x3030, 0x5418, 0x0c48, 0x1e00, 0x4260,
- 0x4230, 0x0c18, 0x1e48, 0x3000, 0x5460,
- 0x5430, 0x1e18, 0x3048, 0x4200, 0x0c60,
- 0x0c30, 0x3018, 0x4248, 0x5400, 0x1e60,
- 0x1e32, 0x421a, 0x544a, 0x0c02, 0x3062,
- 0x3032, 0x541a, 0x0c4a, 0x1e02, 0x4262,
- 0x4232, 0x0c1a, 0x1e4a, 0x3002, 0x5462,
- 0x5432, 0x1e1a, 0x304a, 0x4202, 0x0c62,
- 0x0c32, 0x301a, 0x424a, 0x5402, 0x1e62,
- 0x1e34, 0x421c, 0x544c, 0x0c04, 0x3064,
- 0x3034, 0x541c, 0x0c4c, 0x1e04, 0x4264,
- 0x4234, 0x0c1c, 0x1e4c, 0x3004, 0x5464,
- 0x5434, 0x1e1c, 0x304c, 0x4204, 0x0c64,
- 0x0c34, 0x301c, 0x424c, 0x5404, 0x1e64,
- 0x1e36, 0x421e, 0x544e, 0x0c06, 0x3066,
- 0x3036, 0x541e, 0x0c4e, 0x1e06, 0x4266,
- 0x4236, 0x0c1e, 0x1e4e, 0x3006, 0x5466,
- 0x5436, 0x1e1e, 0x304e, 0x4206, 0x0c66,
- 0x0c36, 0x301e, 0x424e, 0x5406, 0x1e66,
- 0x1e38, 0x4220, 0x5450, 0x0c08, 0x3068,
- 0x3038, 0x5420, 0x0c50, 0x1e08, 0x4268,
- 0x4238, 0x0c20, 0x1e50, 0x3008, 0x5468,
- 0x5438, 0x1e20, 0x3050, 0x4208, 0x0c68,
- 0x0c38, 0x3020, 0x4250, 0x5408, 0x1e68,
- 0x1e3a, 0x4222, 0x5452, 0x0c0a, 0x306a,
- 0x303a, 0x5422, 0x0c52, 0x1e0a, 0x426a,
- 0x423a, 0x0c22, 0x1e52, 0x300a, 0x546a,
- 0x543a, 0x1e22, 0x3052, 0x420a, 0x0c6a,
- 0x0c3a, 0x3022, 0x4252, 0x540a, 0x1e6a,
- 0x2030, 0x4418, 0x5648, 0x0e00, 0x3260,
- 0x3230, 0x5618, 0x0e48, 0x2000, 0x4460,
- 0x4430, 0x0e18, 0x2048, 0x3200, 0x5660,
- 0x5630, 0x2018, 0x3248, 0x4400, 0x0e60,
- 0x0e30, 0x3218, 0x4448, 0x5600, 0x2060,
- 0x2032, 0x441a, 0x564a, 0x0e02, 0x3262,
- 0x3232, 0x561a, 0x0e4a, 0x2002, 0x4462,
- 0x4432, 0x0e1a, 0x204a, 0x3202, 0x5662,
- 0x5632, 0x201a, 0x324a, 0x4402, 0x0e62,
- 0x0e32, 0x321a, 0x444a, 0x5602, 0x2062,
- 0x2034, 0x441c, 0x564c, 0x0e04, 0x3264,
- 0x3234, 0x561c, 0x0e4c, 0x2004, 0x4464,
- 0x4434, 0x0e1c, 0x204c, 0x3204, 0x5664,
- 0x5634, 0x201c, 0x324c, 0x4404, 0x0e64,
- 0x0e34, 0x321c, 0x444c, 0x5604, 0x2064,
- 0x2036, 0x441e, 0x564e, 0x0e06, 0x3266,
- 0x3236, 0x561e, 0x0e4e, 0x2006, 0x4466,
- 0x4436, 0x0e1e, 0x204e, 0x3206, 0x5666,
- 0x5636, 0x201e, 0x324e, 0x4406, 0x0e66,
- 0x0e36, 0x321e, 0x444e, 0x5606, 0x2066,
- 0x2038, 0x4420, 0x5650, 0x0e08, 0x3268,
- 0x3238, 0x5620, 0x0e50, 0x2008, 0x4468,
- 0x4438, 0x0e20, 0x2050, 0x3208, 0x5668,
- 0x5638, 0x2020, 0x3250, 0x4408, 0x0e68,
- 0x0e38, 0x3220, 0x4450, 0x5608, 0x2068,
- 0x203a, 0x4422, 0x5652, 0x0e0a, 0x326a,
- 0x323a, 0x5622, 0x0e52, 0x200a, 0x446a,
- 0x443a, 0x0e22, 0x2052, 0x320a, 0x566a,
- 0x563a, 0x2022, 0x3252, 0x440a, 0x0e6a,
- 0x0e3a, 0x3222, 0x4452, 0x560a, 0x206a,
- 0x2230, 0x4618, 0x5848, 0x1000, 0x3460,
- 0x3430, 0x5818, 0x1048, 0x2200, 0x4660,
- 0x4630, 0x1018, 0x2248, 0x3400, 0x5860,
- 0x5830, 0x2218, 0x3448, 0x4600, 0x1060,
- 0x1030, 0x3418, 0x4648, 0x5800, 0x2260,
- 0x2232, 0x461a, 0x584a, 0x1002, 0x3462,
- 0x3432, 0x581a, 0x104a, 0x2202, 0x4662,
- 0x4632, 0x101a, 0x224a, 0x3402, 0x5862,
- 0x5832, 0x221a, 0x344a, 0x4602, 0x1062,
- 0x1032, 0x341a, 0x464a, 0x5802, 0x2262,
- 0x2234, 0x461c, 0x584c, 0x1004, 0x3464,
- 0x3434, 0x581c, 0x104c, 0x2204, 0x4664,
- 0x4634, 0x101c, 0x224c, 0x3404, 0x5864,
- 0x5834, 0x221c, 0x344c, 0x4604, 0x1064,
- 0x1034, 0x341c, 0x464c, 0x5804, 0x2264,
- 0x2236, 0x461e, 0x584e, 0x1006, 0x3466,
- 0x3436, 0x581e, 0x104e, 0x2206, 0x4666,
- 0x4636, 0x101e, 0x224e, 0x3406, 0x5866,
- 0x5836, 0x221e, 0x344e, 0x4606, 0x1066,
- 0x1036, 0x341e, 0x464e, 0x5806, 0x2266,
- 0x2238, 0x4620, 0x5850, 0x1008, 0x3468,
- 0x3438, 0x5820, 0x1050, 0x2208, 0x4668,
- 0x4638, 0x1020, 0x2250, 0x3408, 0x5868,
- 0x5838, 0x2220, 0x3450, 0x4608, 0x1068,
- 0x1038, 0x3420, 0x4650, 0x5808, 0x2268,
- 0x223a, 0x4622, 0x5852, 0x100a, 0x346a,
- 0x343a, 0x5822, 0x1052, 0x220a, 0x466a,
- 0x463a, 0x1022, 0x2252, 0x340a, 0x586a,
- 0x583a, 0x2222, 0x3452, 0x460a, 0x106a,
- 0x103a, 0x3422, 0x4652, 0x580a, 0x226a,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x363c, 0x0024, 0x1254, 0x240c, 0x486c,
- 0x483c, 0x1224, 0x2454, 0x360c, 0x006c,
- 0x003c, 0x2424, 0x3654, 0x480c, 0x126c,
- 0x123c, 0x3624, 0x4854, 0x000c, 0x246c,
- 0x243c, 0x4824, 0x0054, 0x120c, 0x366c,
- 0x363e, 0x0026, 0x1256, 0x240e, 0x486e,
- 0x483e, 0x1226, 0x2456, 0x360e, 0x006e,
- 0x003e, 0x2426, 0x3656, 0x480e, 0x126e,
- 0x123e, 0x3626, 0x4856, 0x000e, 0x246e,
- 0x243e, 0x4826, 0x0056, 0x120e, 0x366e,
- 0x3640, 0x0028, 0x1258, 0x2410, 0x4870,
- 0x4840, 0x1228, 0x2458, 0x3610, 0x0070,
- 0x0040, 0x2428, 0x3658, 0x4810, 0x1270,
- 0x1240, 0x3628, 0x4858, 0x0010, 0x2470,
- 0x2440, 0x4828, 0x0058, 0x1210, 0x3670,
- 0x3642, 0x002a, 0x125a, 0x2412, 0x4872,
- 0x4842, 0x122a, 0x245a, 0x3612, 0x0072,
- 0x0042, 0x242a, 0x365a, 0x4812, 0x1272,
- 0x1242, 0x362a, 0x485a, 0x0012, 0x2472,
- 0x2442, 0x482a, 0x005a, 0x1212, 0x3672,
- 0x3644, 0x002c, 0x125c, 0x2414, 0x4874,
- 0x4844, 0x122c, 0x245c, 0x3614, 0x0074,
- 0x0044, 0x242c, 0x365c, 0x4814, 0x1274,
- 0x1244, 0x362c, 0x485c, 0x0014, 0x2474,
- 0x2444, 0x482c, 0x005c, 0x1214, 0x3674,
- 0x3646, 0x002e, 0x125e, 0x2416, 0x4876,
- 0x4846, 0x122e, 0x245e, 0x3616, 0x0076,
- 0x0046, 0x242e, 0x365e, 0x4816, 0x1276,
- 0x1246, 0x362e, 0x485e, 0x0016, 0x2476,
- 0x2446, 0x482e, 0x005e, 0x1216, 0x3676,
- 0x383c, 0x0224, 0x1454, 0x260c, 0x4a6c,
- 0x4a3c, 0x1424, 0x2654, 0x380c, 0x026c,
- 0x023c, 0x2624, 0x3854, 0x4a0c, 0x146c,
- 0x143c, 0x3824, 0x4a54, 0x020c, 0x266c,
- 0x263c, 0x4a24, 0x0254, 0x140c, 0x386c,
- 0x383e, 0x0226, 0x1456, 0x260e, 0x4a6e,
- 0x4a3e, 0x1426, 0x2656, 0x380e, 0x026e,
- 0x023e, 0x2626, 0x3856, 0x4a0e, 0x146e,
- 0x143e, 0x3826, 0x4a56, 0x020e, 0x266e,
- 0x263e, 0x4a26, 0x0256, 0x140e, 0x386e,
- 0x3840, 0x0228, 0x1458, 0x2610, 0x4a70,
- 0x4a40, 0x1428, 0x2658, 0x3810, 0x0270,
- 0x0240, 0x2628, 0x3858, 0x4a10, 0x1470,
- 0x1440, 0x3828, 0x4a58, 0x0210, 0x2670,
- 0x2640, 0x4a28, 0x0258, 0x1410, 0x3870,
- 0x3842, 0x022a, 0x145a, 0x2612, 0x4a72,
- 0x4a42, 0x142a, 0x265a, 0x3812, 0x0272,
- 0x0242, 0x262a, 0x385a, 0x4a12, 0x1472,
- 0x1442, 0x382a, 0x4a5a, 0x0212, 0x2672,
- 0x2642, 0x4a2a, 0x025a, 0x1412, 0x3872,
- 0x3844, 0x022c, 0x145c, 0x2614, 0x4a74,
- 0x4a44, 0x142c, 0x265c, 0x3814, 0x0274,
- 0x0244, 0x262c, 0x385c, 0x4a14, 0x1474,
- 0x1444, 0x382c, 0x4a5c, 0x0214, 0x2674,
- 0x2644, 0x4a2c, 0x025c, 0x1414, 0x3874,
- 0x3846, 0x022e, 0x145e, 0x2616, 0x4a76,
- 0x4a46, 0x142e, 0x265e, 0x3816, 0x0276,
- 0x0246, 0x262e, 0x385e, 0x4a16, 0x1476,
- 0x1446, 0x382e, 0x4a5e, 0x0216, 0x2676,
- 0x2646, 0x4a2e, 0x025e, 0x1416, 0x3876,
- 0x3a3c, 0x0424, 0x1654, 0x280c, 0x4c6c,
- 0x4c3c, 0x1624, 0x2854, 0x3a0c, 0x046c,
- 0x043c, 0x2824, 0x3a54, 0x4c0c, 0x166c,
- 0x163c, 0x3a24, 0x4c54, 0x040c, 0x286c,
- 0x283c, 0x4c24, 0x0454, 0x160c, 0x3a6c,
- 0x3a3e, 0x0426, 0x1656, 0x280e, 0x4c6e,
- 0x4c3e, 0x1626, 0x2856, 0x3a0e, 0x046e,
- 0x043e, 0x2826, 0x3a56, 0x4c0e, 0x166e,
- 0x163e, 0x3a26, 0x4c56, 0x040e, 0x286e,
- 0x283e, 0x4c26, 0x0456, 0x160e, 0x3a6e,
- 0x3a40, 0x0428, 0x1658, 0x2810, 0x4c70,
- 0x4c40, 0x1628, 0x2858, 0x3a10, 0x0470,
- 0x0440, 0x2828, 0x3a58, 0x4c10, 0x1670,
- 0x1640, 0x3a28, 0x4c58, 0x0410, 0x2870,
- 0x2840, 0x4c28, 0x0458, 0x1610, 0x3a70,
- 0x3a42, 0x042a, 0x165a, 0x2812, 0x4c72,
- 0x4c42, 0x162a, 0x285a, 0x3a12, 0x0472,
- 0x0442, 0x282a, 0x3a5a, 0x4c12, 0x1672,
- 0x1642, 0x3a2a, 0x4c5a, 0x0412, 0x2872,
- 0x2842, 0x4c2a, 0x045a, 0x1612, 0x3a72,
- 0x3a44, 0x042c, 0x165c, 0x2814, 0x4c74,
- 0x4c44, 0x162c, 0x285c, 0x3a14, 0x0474,
- 0x0444, 0x282c, 0x3a5c, 0x4c14, 0x1674,
- 0x1644, 0x3a2c, 0x4c5c, 0x0414, 0x2874,
- 0x2844, 0x4c2c, 0x045c, 0x1614, 0x3a74,
- 0x3a46, 0x042e, 0x165e, 0x2816, 0x4c76,
- 0x4c46, 0x162e, 0x285e, 0x3a16, 0x0476,
- 0x0446, 0x282e, 0x3a5e, 0x4c16, 0x1676,
- 0x1646, 0x3a2e, 0x4c5e, 0x0416, 0x2876,
- 0x2846, 0x4c2e, 0x045e, 0x1616, 0x3a76,
- 0x3c3c, 0x0624, 0x1854, 0x2a0c, 0x4e6c,
- 0x4e3c, 0x1824, 0x2a54, 0x3c0c, 0x066c,
- 0x063c, 0x2a24, 0x3c54, 0x4e0c, 0x186c,
- 0x183c, 0x3c24, 0x4e54, 0x060c, 0x2a6c,
- 0x2a3c, 0x4e24, 0x0654, 0x180c, 0x3c6c,
- 0x3c3e, 0x0626, 0x1856, 0x2a0e, 0x4e6e,
- 0x4e3e, 0x1826, 0x2a56, 0x3c0e, 0x066e,
- 0x063e, 0x2a26, 0x3c56, 0x4e0e, 0x186e,
- 0x183e, 0x3c26, 0x4e56, 0x060e, 0x2a6e,
- 0x2a3e, 0x4e26, 0x0656, 0x180e, 0x3c6e,
- 0x3c40, 0x0628, 0x1858, 0x2a10, 0x4e70,
- 0x4e40, 0x1828, 0x2a58, 0x3c10, 0x0670,
- 0x0640, 0x2a28, 0x3c58, 0x4e10, 0x1870,
- 0x1840, 0x3c28, 0x4e58, 0x0610, 0x2a70,
- 0x2a40, 0x4e28, 0x0658, 0x1810, 0x3c70,
- 0x3c42, 0x062a, 0x185a, 0x2a12, 0x4e72,
- 0x4e42, 0x182a, 0x2a5a, 0x3c12, 0x0672,
- 0x0642, 0x2a2a, 0x3c5a, 0x4e12, 0x1872,
- 0x1842, 0x3c2a, 0x4e5a, 0x0612, 0x2a72,
- 0x2a42, 0x4e2a, 0x065a, 0x1812, 0x3c72,
- 0x3c44, 0x062c, 0x185c, 0x2a14, 0x4e74,
- 0x4e44, 0x182c, 0x2a5c, 0x3c14, 0x0674,
- 0x0644, 0x2a2c, 0x3c5c, 0x4e14, 0x1874,
- 0x1844, 0x3c2c, 0x4e5c, 0x0614, 0x2a74,
- 0x2a44, 0x4e2c, 0x065c, 0x1814, 0x3c74,
- 0x3c46, 0x062e, 0x185e, 0x2a16, 0x4e76,
- 0x4e46, 0x182e, 0x2a5e, 0x3c16, 0x0676,
- 0x0646, 0x2a2e, 0x3c5e, 0x4e16, 0x1876,
- 0x1846, 0x3c2e, 0x4e5e, 0x0616, 0x2a76,
- 0x2a46, 0x4e2e, 0x065e, 0x1816, 0x3c76,
- 0x3e3c, 0x0824, 0x1a54, 0x2c0c, 0x506c,
- 0x503c, 0x1a24, 0x2c54, 0x3e0c, 0x086c,
- 0x083c, 0x2c24, 0x3e54, 0x500c, 0x1a6c,
- 0x1a3c, 0x3e24, 0x5054, 0x080c, 0x2c6c,
- 0x2c3c, 0x5024, 0x0854, 0x1a0c, 0x3e6c,
- 0x3e3e, 0x0826, 0x1a56, 0x2c0e, 0x506e,
- 0x503e, 0x1a26, 0x2c56, 0x3e0e, 0x086e,
- 0x083e, 0x2c26, 0x3e56, 0x500e, 0x1a6e,
- 0x1a3e, 0x3e26, 0x5056, 0x080e, 0x2c6e,
- 0x2c3e, 0x5026, 0x0856, 0x1a0e, 0x3e6e,
- 0x3e40, 0x0828, 0x1a58, 0x2c10, 0x5070,
- 0x5040, 0x1a28, 0x2c58, 0x3e10, 0x0870,
- 0x0840, 0x2c28, 0x3e58, 0x5010, 0x1a70,
- 0x1a40, 0x3e28, 0x5058, 0x0810, 0x2c70,
- 0x2c40, 0x5028, 0x0858, 0x1a10, 0x3e70,
- 0x3e42, 0x082a, 0x1a5a, 0x2c12, 0x5072,
- 0x5042, 0x1a2a, 0x2c5a, 0x3e12, 0x0872,
- 0x0842, 0x2c2a, 0x3e5a, 0x5012, 0x1a72,
- 0x1a42, 0x3e2a, 0x505a, 0x0812, 0x2c72,
- 0x2c42, 0x502a, 0x085a, 0x1a12, 0x3e72,
- 0x3e44, 0x082c, 0x1a5c, 0x2c14, 0x5074,
- 0x5044, 0x1a2c, 0x2c5c, 0x3e14, 0x0874,
- 0x0844, 0x2c2c, 0x3e5c, 0x5014, 0x1a74,
- 0x1a44, 0x3e2c, 0x505c, 0x0814, 0x2c74,
- 0x2c44, 0x502c, 0x085c, 0x1a14, 0x3e74,
- 0x3e46, 0x082e, 0x1a5e, 0x2c16, 0x5076,
- 0x5046, 0x1a2e, 0x2c5e, 0x3e16, 0x0876,
- 0x0846, 0x2c2e, 0x3e5e, 0x5016, 0x1a76,
- 0x1a46, 0x3e2e, 0x505e, 0x0816, 0x2c76,
- 0x2c46, 0x502e, 0x085e, 0x1a16, 0x3e76,
- 0x403c, 0x0a24, 0x1c54, 0x2e0c, 0x526c,
- 0x523c, 0x1c24, 0x2e54, 0x400c, 0x0a6c,
- 0x0a3c, 0x2e24, 0x4054, 0x520c, 0x1c6c,
- 0x1c3c, 0x4024, 0x5254, 0x0a0c, 0x2e6c,
- 0x2e3c, 0x5224, 0x0a54, 0x1c0c, 0x406c,
- 0x403e, 0x0a26, 0x1c56, 0x2e0e, 0x526e,
- 0x523e, 0x1c26, 0x2e56, 0x400e, 0x0a6e,
- 0x0a3e, 0x2e26, 0x4056, 0x520e, 0x1c6e,
- 0x1c3e, 0x4026, 0x5256, 0x0a0e, 0x2e6e,
- 0x2e3e, 0x5226, 0x0a56, 0x1c0e, 0x406e,
- 0x4040, 0x0a28, 0x1c58, 0x2e10, 0x5270,
- 0x5240, 0x1c28, 0x2e58, 0x4010, 0x0a70,
- 0x0a40, 0x2e28, 0x4058, 0x5210, 0x1c70,
- 0x1c40, 0x4028, 0x5258, 0x0a10, 0x2e70,
- 0x2e40, 0x5228, 0x0a58, 0x1c10, 0x4070,
- 0x4042, 0x0a2a, 0x1c5a, 0x2e12, 0x5272,
- 0x5242, 0x1c2a, 0x2e5a, 0x4012, 0x0a72,
- 0x0a42, 0x2e2a, 0x405a, 0x5212, 0x1c72,
- 0x1c42, 0x402a, 0x525a, 0x0a12, 0x2e72,
- 0x2e42, 0x522a, 0x0a5a, 0x1c12, 0x4072,
- 0x4044, 0x0a2c, 0x1c5c, 0x2e14, 0x5274,
- 0x5244, 0x1c2c, 0x2e5c, 0x4014, 0x0a74,
- 0x0a44, 0x2e2c, 0x405c, 0x5214, 0x1c74,
- 0x1c44, 0x402c, 0x525c, 0x0a14, 0x2e74,
- 0x2e44, 0x522c, 0x0a5c, 0x1c14, 0x4074,
- 0x4046, 0x0a2e, 0x1c5e, 0x2e16, 0x5276,
- 0x5246, 0x1c2e, 0x2e5e, 0x4016, 0x0a76,
- 0x0a46, 0x2e2e, 0x405e, 0x5216, 0x1c76,
- 0x1c46, 0x402e, 0x525e, 0x0a16, 0x2e76,
- 0x2e46, 0x522e, 0x0a5e, 0x1c16, 0x4076,
- 0x423c, 0x0c24, 0x1e54, 0x300c, 0x546c,
- 0x543c, 0x1e24, 0x3054, 0x420c, 0x0c6c,
- 0x0c3c, 0x3024, 0x4254, 0x540c, 0x1e6c,
- 0x1e3c, 0x4224, 0x5454, 0x0c0c, 0x306c,
- 0x303c, 0x5424, 0x0c54, 0x1e0c, 0x426c,
- 0x423e, 0x0c26, 0x1e56, 0x300e, 0x546e,
- 0x543e, 0x1e26, 0x3056, 0x420e, 0x0c6e,
- 0x0c3e, 0x3026, 0x4256, 0x540e, 0x1e6e,
- 0x1e3e, 0x4226, 0x5456, 0x0c0e, 0x306e,
- 0x303e, 0x5426, 0x0c56, 0x1e0e, 0x426e,
- 0x4240, 0x0c28, 0x1e58, 0x3010, 0x5470,
- 0x5440, 0x1e28, 0x3058, 0x4210, 0x0c70,
- 0x0c40, 0x3028, 0x4258, 0x5410, 0x1e70,
- 0x1e40, 0x4228, 0x5458, 0x0c10, 0x3070,
- 0x3040, 0x5428, 0x0c58, 0x1e10, 0x4270,
- 0x4242, 0x0c2a, 0x1e5a, 0x3012, 0x5472,
- 0x5442, 0x1e2a, 0x305a, 0x4212, 0x0c72,
- 0x0c42, 0x302a, 0x425a, 0x5412, 0x1e72,
- 0x1e42, 0x422a, 0x545a, 0x0c12, 0x3072,
- 0x3042, 0x542a, 0x0c5a, 0x1e12, 0x4272,
- 0x4244, 0x0c2c, 0x1e5c, 0x3014, 0x5474,
- 0x5444, 0x1e2c, 0x305c, 0x4214, 0x0c74,
- 0x0c44, 0x302c, 0x425c, 0x5414, 0x1e74,
- 0x1e44, 0x422c, 0x545c, 0x0c14, 0x3074,
- 0x3044, 0x542c, 0x0c5c, 0x1e14, 0x4274,
- 0x4246, 0x0c2e, 0x1e5e, 0x3016, 0x5476,
- 0x5446, 0x1e2e, 0x305e, 0x4216, 0x0c76,
- 0x0c46, 0x302e, 0x425e, 0x5416, 0x1e76,
- 0x1e46, 0x422e, 0x545e, 0x0c16, 0x3076,
- 0x3046, 0x542e, 0x0c5e, 0x1e16, 0x4276,
- 0x443c, 0x0e24, 0x2054, 0x320c, 0x566c,
- 0x563c, 0x2024, 0x3254, 0x440c, 0x0e6c,
- 0x0e3c, 0x3224, 0x4454, 0x560c, 0x206c,
- 0x203c, 0x4424, 0x5654, 0x0e0c, 0x326c,
- 0x323c, 0x5624, 0x0e54, 0x200c, 0x446c,
- 0x443e, 0x0e26, 0x2056, 0x320e, 0x566e,
- 0x563e, 0x2026, 0x3256, 0x440e, 0x0e6e,
- 0x0e3e, 0x3226, 0x4456, 0x560e, 0x206e,
- 0x203e, 0x4426, 0x5656, 0x0e0e, 0x326e,
- 0x323e, 0x5626, 0x0e56, 0x200e, 0x446e,
- 0x4440, 0x0e28, 0x2058, 0x3210, 0x5670,
- 0x5640, 0x2028, 0x3258, 0x4410, 0x0e70,
- 0x0e40, 0x3228, 0x4458, 0x5610, 0x2070,
- 0x2040, 0x4428, 0x5658, 0x0e10, 0x3270,
- 0x3240, 0x5628, 0x0e58, 0x2010, 0x4470,
- 0x4442, 0x0e2a, 0x205a, 0x3212, 0x5672,
- 0x5642, 0x202a, 0x325a, 0x4412, 0x0e72,
- 0x0e42, 0x322a, 0x445a, 0x5612, 0x2072,
- 0x2042, 0x442a, 0x565a, 0x0e12, 0x3272,
- 0x3242, 0x562a, 0x0e5a, 0x2012, 0x4472,
- 0x4444, 0x0e2c, 0x205c, 0x3214, 0x5674,
- 0x5644, 0x202c, 0x325c, 0x4414, 0x0e74,
- 0x0e44, 0x322c, 0x445c, 0x5614, 0x2074,
- 0x2044, 0x442c, 0x565c, 0x0e14, 0x3274,
- 0x3244, 0x562c, 0x0e5c, 0x2014, 0x4474,
- 0x4446, 0x0e2e, 0x205e, 0x3216, 0x5676,
- 0x5646, 0x202e, 0x325e, 0x4416, 0x0e76,
- 0x0e46, 0x322e, 0x445e, 0x5616, 0x2076,
- 0x2046, 0x442e, 0x565e, 0x0e16, 0x3276,
- 0x3246, 0x562e, 0x0e5e, 0x2016, 0x4476,
- 0x463c, 0x1024, 0x2254, 0x340c, 0x586c,
- 0x583c, 0x2224, 0x3454, 0x460c, 0x106c,
- 0x103c, 0x3424, 0x4654, 0x580c, 0x226c,
- 0x223c, 0x4624, 0x5854, 0x100c, 0x346c,
- 0x343c, 0x5824, 0x1054, 0x220c, 0x466c,
- 0x463e, 0x1026, 0x2256, 0x340e, 0x586e,
- 0x583e, 0x2226, 0x3456, 0x460e, 0x106e,
- 0x103e, 0x3426, 0x4656, 0x580e, 0x226e,
- 0x223e, 0x4626, 0x5856, 0x100e, 0x346e,
- 0x343e, 0x5826, 0x1056, 0x220e, 0x466e,
- 0x4640, 0x1028, 0x2258, 0x3410, 0x5870,
- 0x5840, 0x2228, 0x3458, 0x4610, 0x1070,
- 0x1040, 0x3428, 0x4658, 0x5810, 0x2270,
- 0x2240, 0x4628, 0x5858, 0x1010, 0x3470,
- 0x3440, 0x5828, 0x1058, 0x2210, 0x4670,
- 0x4642, 0x102a, 0x225a, 0x3412, 0x5872,
- 0x5842, 0x222a, 0x345a, 0x4612, 0x1072,
- 0x1042, 0x342a, 0x465a, 0x5812, 0x2272,
- 0x2242, 0x462a, 0x585a, 0x1012, 0x3472,
- 0x3442, 0x582a, 0x105a, 0x2212, 0x4672,
- 0x4644, 0x102c, 0x225c, 0x3414, 0x5874,
- 0x5844, 0x222c, 0x345c, 0x4614, 0x1074,
- 0x1044, 0x342c, 0x465c, 0x5814, 0x2274,
- 0x2244, 0x462c, 0x585c, 0x1014, 0x3474,
- 0x3444, 0x582c, 0x105c, 0x2214, 0x4674,
- 0x4646, 0x102e, 0x225e, 0x3416, 0x5876,
- 0x5846, 0x222e, 0x345e, 0x4616, 0x1076,
- 0x1046, 0x342e, 0x465e, 0x5816, 0x2276,
- 0x2246, 0x462e, 0x585e, 0x1016, 0x3476,
- 0x3446, 0x582e, 0x105e, 0x2216, 0x4676,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
-};
-
-
/* DV25/50 DCT coefficient weights and inverse weights */
/* created by dvtables.py */
static const int dv_weight_bits = 18;
@@ -6175,6 +470,10 @@ static DVwork_chunk work_chunks_dv100ntscp[2*10*27];
static DVwork_chunk work_chunks_dv100pali [4*12*27];
static DVwork_chunk work_chunks_dv100ntsci[4*10*27];
+static uint32_t dv_idct_factor_sd [2*2*22*64];
+static uint32_t dv_idct_factor_hd1080[2*4*16*64];
+static uint32_t dv_idct_factor_hd720 [2*4*16*64];
+
static const DVprofile dv_profiles[] = {
{ .dsf = 0,
.video_stype = 0x0,
@@ -6186,8 +485,8 @@ static const DVprofile dv_profiles[] = {
.height = 480,
.width = 720,
.sar = {{10, 11}, {40, 33}},
- .video_place = dv_place_411,
.work_chunks = &work_chunks_dv25ntsc[0],
+ .idct_factor = &dv_idct_factor_sd[0],
.pix_fmt = PIX_FMT_YUV411P,
.bpm = 6,
.block_sizes = block_sizes_dv2550,
@@ -6206,8 +505,8 @@ static const DVprofile dv_profiles[] = {
.height = 576,
.width = 720,
.sar = {{59, 54}, {118, 81}},
- .video_place = dv_place_420,
.work_chunks = &work_chunks_dv25pal[0],
+ .idct_factor = &dv_idct_factor_sd[0],
.pix_fmt = PIX_FMT_YUV420P,
.bpm = 6,
.block_sizes = block_sizes_dv2550,
@@ -6226,8 +525,8 @@ static const DVprofile dv_profiles[] = {
.height = 576,
.width = 720,
.sar = {{59, 54}, {118, 81}},
- .video_place = dv_place_411P,
.work_chunks = &work_chunks_dv25pal411[0],
+ .idct_factor = &dv_idct_factor_sd[0],
.pix_fmt = PIX_FMT_YUV411P,
.bpm = 6,
.block_sizes = block_sizes_dv2550,
@@ -6246,8 +545,8 @@ static const DVprofile dv_profiles[] = {
.height = 480,
.width = 720,
.sar = {{10, 11}, {40, 33}},
- .video_place = dv_place_422_525,
.work_chunks = &work_chunks_dv50ntsc[0],
+ .idct_factor = &dv_idct_factor_sd[0],
.pix_fmt = PIX_FMT_YUV422P,
.bpm = 6,
.block_sizes = block_sizes_dv2550,
@@ -6266,8 +565,8 @@ static const DVprofile dv_profiles[] = {
.height = 576,
.width = 720,
.sar = {{59, 54}, {118, 81}},
- .video_place = dv_place_422_625,
.work_chunks = &work_chunks_dv50pal[0],
+ .idct_factor = &dv_idct_factor_sd[0],
.pix_fmt = PIX_FMT_YUV422P,
.bpm = 6,
.block_sizes = block_sizes_dv2550,
@@ -6286,8 +585,8 @@ static const DVprofile dv_profiles[] = {
.height = 1080,
.width = 1280,
.sar = {{1, 1}, {1, 1}},
- .video_place = dv_place_1080i60,
.work_chunks = &work_chunks_dv100ntsci[0],
+ .idct_factor = &dv_idct_factor_hd1080[0],
.pix_fmt = PIX_FMT_YUV422P,
.bpm = 8,
.block_sizes = block_sizes_dv100,
@@ -6306,8 +605,8 @@ static const DVprofile dv_profiles[] = {
.height = 1080,
.width = 1440,
.sar = {{1, 1}, {1, 1}},
- .video_place = dv_place_1080i50,
.work_chunks = &work_chunks_dv100pali[0],
+ .idct_factor = &dv_idct_factor_hd1080[0],
.pix_fmt = PIX_FMT_YUV422P,
.bpm = 8,
.block_sizes = block_sizes_dv100,
@@ -6326,8 +625,8 @@ static const DVprofile dv_profiles[] = {
.height = 720,
.width = 960,
.sar = {{1, 1}, {1, 1}},
- .video_place = dv_place_720p60,
.work_chunks = &work_chunks_dv100ntscp[0],
+ .idct_factor = &dv_idct_factor_hd720[0],
.pix_fmt = PIX_FMT_YUV422P,
.bpm = 8,
.block_sizes = block_sizes_dv100,
@@ -6346,8 +645,8 @@ static const DVprofile dv_profiles[] = {
.height = 720,
.width = 960,
.sar = {{1, 1}, {1, 1}},
- .video_place = dv_place_720p50,
.work_chunks = &work_chunks_dv100palp[0],
+ .idct_factor = &dv_idct_factor_hd720[0],
.pix_fmt = PIX_FMT_YUV422P,
.bpm = 8,
.block_sizes = block_sizes_dv100,
diff --git a/libavcodec/dvdsubdec.c b/libavcodec/dvdsubdec.c
index f95c329..dac152b 100644
--- a/libavcodec/dvdsubdec.c
+++ b/libavcodec/dvdsubdec.c
@@ -319,17 +319,19 @@ static int decode_dvd_subtitles(AVSubtitle *sub_header,
if (w > 0 && h > 0) {
if (sub_header->rects != NULL) {
for (i = 0; i < sub_header->num_rects; i++) {
- av_free(sub_header->rects[i].bitmap);
- av_free(sub_header->rects[i].rgba_palette);
+ av_freep(&sub_header->rects[i]->pict.data[0]);
+ av_freep(&sub_header->rects[i]->pict.data[1]);
+ av_freep(&sub_header->rects[i]);
}
av_freep(&sub_header->rects);
sub_header->num_rects = 0;
}
bitmap = av_malloc(w * h);
- sub_header->rects = av_mallocz(sizeof(AVSubtitleRect));
+ sub_header->rects = av_mallocz(sizeof(*sub_header->rects));
+ sub_header->rects[0] = av_mallocz(sizeof(AVSubtitleRect));
sub_header->num_rects = 1;
- sub_header->rects[0].bitmap = bitmap;
+ sub_header->rects[0]->pict.data[0] = bitmap;
decode_rle(bitmap, w * 2, w, (h + 1) / 2,
buf, offset1, buf_size, is_8bit);
decode_rle(bitmap + w, w * 2, w, h / 2,
@@ -337,20 +339,20 @@ static int decode_dvd_subtitles(AVSubtitle *sub_header,
if (is_8bit) {
if (yuv_palette == 0)
goto fail;
- sub_header->rects[0].rgba_palette = av_malloc(256 * 4);
- sub_header->rects[0].nb_colors = 256;
- yuv_a_to_rgba(yuv_palette, alpha, sub_header->rects[0].rgba_palette, 256);
+ sub_header->rects[0]->pict.data[1] = av_malloc(256 * 4);
+ sub_header->rects[0]->nb_colors = 256;
+ yuv_a_to_rgba(yuv_palette, alpha, (uint32_t*)sub_header->rects[0]->pict.data[1], 256);
} else {
- sub_header->rects[0].rgba_palette = av_malloc(4 * 4);
- sub_header->rects[0].nb_colors = 4;
- guess_palette(sub_header->rects[0].rgba_palette,
+ sub_header->rects[0]->pict.data[1] = av_malloc(4 * 4);
+ sub_header->rects[0]->nb_colors = 4;
+ guess_palette((uint32_t*)sub_header->rects[0]->pict.data[1],
colormap, alpha, 0xffff00);
}
- sub_header->rects[0].x = x1;
- sub_header->rects[0].y = y1;
- sub_header->rects[0].w = w;
- sub_header->rects[0].h = h;
- sub_header->rects[0].linesize = w;
+ sub_header->rects[0]->x = x1;
+ sub_header->rects[0]->y = y1;
+ sub_header->rects[0]->w = w;
+ sub_header->rects[0]->h = h;
+ sub_header->rects[0]->pict.linesize[0] = w;
}
}
if (next_cmd_pos == cmd_pos)
@@ -362,8 +364,9 @@ static int decode_dvd_subtitles(AVSubtitle *sub_header,
fail:
if (sub_header->rects != NULL) {
for (i = 0; i < sub_header->num_rects; i++) {
- av_free(sub_header->rects[i].bitmap);
- av_free(sub_header->rects[i].rgba_palette);
+ av_freep(&sub_header->rects[i]->pict.data[0]);
+ av_freep(&sub_header->rects[i]->pict.data[1]);
+ av_freep(&sub_header->rects[i]);
}
av_freep(&sub_header->rects);
sub_header->num_rects = 0;
@@ -390,34 +393,34 @@ static int find_smallest_bounding_rectangle(AVSubtitle *s)
int y1, y2, x1, x2, y, w, h, i;
uint8_t *bitmap;
- if (s->num_rects == 0 || s->rects == NULL || s->rects[0].w <= 0 || s->rects[0].h <= 0)
+ if (s->num_rects == 0 || s->rects == NULL || s->rects[0]->w <= 0 || s->rects[0]->h <= 0)
return 0;
memset(transp_color, 0, 256);
- for(i = 0; i < s->rects[0].nb_colors; i++) {
- if ((s->rects[0].rgba_palette[i] >> 24) == 0)
+ for(i = 0; i < s->rects[0]->nb_colors; i++) {
+ if ((((uint32_t*)s->rects[0]->pict.data[1])[i] >> 24) == 0)
transp_color[i] = 1;
}
y1 = 0;
- while (y1 < s->rects[0].h && is_transp(s->rects[0].bitmap + y1 * s->rects[0].linesize,
- 1, s->rects[0].w, transp_color))
+ while (y1 < s->rects[0]->h && is_transp(s->rects[0]->pict.data[0] + y1 * s->rects[0]->pict.linesize[0],
+ 1, s->rects[0]->w, transp_color))
y1++;
- if (y1 == s->rects[0].h) {
- av_freep(&s->rects[0].bitmap);
- s->rects[0].w = s->rects[0].h = 0;
+ if (y1 == s->rects[0]->h) {
+ av_freep(&s->rects[0]->pict.data[0]);
+ s->rects[0]->w = s->rects[0]->h = 0;
return 0;
}
- y2 = s->rects[0].h - 1;
- while (y2 > 0 && is_transp(s->rects[0].bitmap + y2 * s->rects[0].linesize, 1,
- s->rects[0].w, transp_color))
+ y2 = s->rects[0]->h - 1;
+ while (y2 > 0 && is_transp(s->rects[0]->pict.data[0] + y2 * s->rects[0]->pict.linesize[0], 1,
+ s->rects[0]->w, transp_color))
y2--;
x1 = 0;
- while (x1 < (s->rects[0].w - 1) && is_transp(s->rects[0].bitmap + x1, s->rects[0].linesize,
- s->rects[0].h, transp_color))
+ while (x1 < (s->rects[0]->w - 1) && is_transp(s->rects[0]->pict.data[0] + x1, s->rects[0]->pict.linesize[0],
+ s->rects[0]->h, transp_color))
x1++;
- x2 = s->rects[0].w - 1;
- while (x2 > 0 && is_transp(s->rects[0].bitmap + x2, s->rects[0].linesize, s->rects[0].h,
+ x2 = s->rects[0]->w - 1;
+ while (x2 > 0 && is_transp(s->rects[0]->pict.data[0] + x2, s->rects[0]->pict.linesize[0], s->rects[0]->h,
transp_color))
x2--;
w = x2 - x1 + 1;
@@ -426,15 +429,15 @@ static int find_smallest_bounding_rectangle(AVSubtitle *s)
if (!bitmap)
return 1;
for(y = 0; y < h; y++) {
- memcpy(bitmap + w * y, s->rects[0].bitmap + x1 + (y1 + y) * s->rects[0].linesize, w);
+ memcpy(bitmap + w * y, s->rects[0]->pict.data[0] + x1 + (y1 + y) * s->rects[0]->pict.linesize[0], w);
}
- av_freep(&s->rects[0].bitmap);
- s->rects[0].bitmap = bitmap;
- s->rects[0].linesize = w;
- s->rects[0].w = w;
- s->rects[0].h = h;
- s->rects[0].x += x1;
- s->rects[0].y += y1;
+ av_freep(&s->rects[0]->pict.data[0]);
+ s->rects[0]->pict.data[0] = bitmap;
+ s->rects[0]->pict.linesize[0] = w;
+ s->rects[0]->w = w;
+ s->rects[0]->h = h;
+ s->rects[0]->x += x1;
+ s->rects[0]->y += y1;
return 1;
}
@@ -491,8 +494,8 @@ static int dvdsub_decode(AVCodecContext *avctx,
av_log(NULL, AV_LOG_INFO, "start=%d ms end =%d ms\n",
sub->start_display_time,
sub->end_display_time);
- ppm_save("/tmp/a.ppm", sub->rects[0].bitmap,
- sub->rects[0].w, sub->rects[0].h, sub->rects[0].rgba_palette);
+ ppm_save("/tmp/a.ppm", sub->rects[0]->pict.data[0],
+ sub->rects[0]->w, sub->rects[0]->h, sub->rects[0]->pict.data[1]);
#endif
*data_size = 1;
diff --git a/libavcodec/dvdsubenc.c b/libavcodec/dvdsubenc.c
index a5380ce..5f6bc21 100644
--- a/libavcodec/dvdsubenc.c
+++ b/libavcodec/dvdsubenc.c
@@ -108,10 +108,10 @@ static int encode_dvd_subtitles(uint8_t *outbuf, int outbuf_size,
cmap[i] = 0;
}
for (object_id = 0; object_id < rects; object_id++)
- for (i=0; i<h->rects[object_id].w*h->rects[object_id].h; ++i) {
- color = h->rects[object_id].bitmap[i];
+ for (i=0; i<h->rects[object_id]->w*h->rects[object_id]->h; ++i) {
+ color = h->rects[object_id]->pict.data[0][i];
// only count non-transparent pixels
- alpha = h->rects[object_id].rgba_palette[color] >> 24;
+ alpha = ((uint32_t*)h->rects[object_id]->pict.data[1])[color] >> 24;
hist[color] += alpha;
}
for (color=3;; --color) {
@@ -138,19 +138,19 @@ static int encode_dvd_subtitles(uint8_t *outbuf, int outbuf_size,
for (object_id = 0; object_id < rects; object_id++) {
offset1[object_id] = q - outbuf;
// worst case memory requirement: 1 nibble per pixel..
- if ((q - outbuf) + h->rects[object_id].w*h->rects[object_id].h/2
+ if ((q - outbuf) + h->rects[object_id]->w*h->rects[object_id]->h/2
+ 17*rects + 21 > outbuf_size) {
av_log(NULL, AV_LOG_ERROR, "dvd_subtitle too big\n");
return -1;
}
- dvd_encode_rle(&q, h->rects[object_id].bitmap,
- h->rects[object_id].w*2,
- h->rects[object_id].w, h->rects[object_id].h >> 1,
+ dvd_encode_rle(&q, h->rects[object_id]->pict.data[0],
+ h->rects[object_id]->w*2,
+ h->rects[object_id]->w, h->rects[object_id]->h >> 1,
cmap);
offset2[object_id] = q - outbuf;
- dvd_encode_rle(&q, h->rects[object_id].bitmap + h->rects[object_id].w,
- h->rects[object_id].w*2,
- h->rects[object_id].w, h->rects[object_id].h >> 1,
+ dvd_encode_rle(&q, h->rects[object_id]->pict.data[0] + h->rects[object_id]->w,
+ h->rects[object_id]->w*2,
+ h->rects[object_id]->w, h->rects[object_id]->h >> 1,
cmap);
}
@@ -170,17 +170,17 @@ static int encode_dvd_subtitles(uint8_t *outbuf, int outbuf_size,
// XXX not sure if more than one rect can really be encoded..
// 12 bytes per rect
for (object_id = 0; object_id < rects; object_id++) {
- int x2 = h->rects[object_id].x + h->rects[object_id].w - 1;
- int y2 = h->rects[object_id].y + h->rects[object_id].h - 1;
+ int x2 = h->rects[object_id]->x + h->rects[object_id]->w - 1;
+ int y2 = h->rects[object_id]->y + h->rects[object_id]->h - 1;
*q++ = 0x05;
// x1 x2 -> 6 nibbles
- *q++ = h->rects[object_id].x >> 4;
- *q++ = (h->rects[object_id].x << 4) | ((x2 >> 8) & 0xf);
+ *q++ = h->rects[object_id]->x >> 4;
+ *q++ = (h->rects[object_id]->x << 4) | ((x2 >> 8) & 0xf);
*q++ = x2;
// y1 y2 -> 6 nibbles
- *q++ = h->rects[object_id].y >> 4;
- *q++ = (h->rects[object_id].y << 4) | ((y2 >> 8) & 0xf);
+ *q++ = h->rects[object_id]->y >> 4;
+ *q++ = (h->rects[object_id]->y << 4) | ((y2 >> 8) & 0xf);
*q++ = y2;
*q++ = 0x06;
diff --git a/libavcodec/eac3dec.c b/libavcodec/eac3dec.c
index 0b10b41..f57c1cc 100644
--- a/libavcodec/eac3dec.c
+++ b/libavcodec/eac3dec.c
@@ -21,6 +21,8 @@
*/
#include "avcodec.h"
+#include "internal.h"
+#include "aac_ac3_parser.h"
#include "ac3.h"
#include "ac3_parser.h"
#include "ac3dec.h"
@@ -182,11 +184,11 @@ int ff_eac3_parse_header(AC3DecodeContext *s)
application can select from. each independent stream can also contain
dependent streams which are used to add or replace channels. */
if (s->frame_type == EAC3_FRAME_TYPE_DEPENDENT) {
- av_log_missing_feature(s->avctx, "Dependent substream decoding", 1);
- return AC3_PARSE_ERROR_FRAME_TYPE;
+ ff_log_missing_feature(s->avctx, "Dependent substream decoding", 1);
+ return AAC_AC3_PARSE_ERROR_FRAME_TYPE;
} else if (s->frame_type == EAC3_FRAME_TYPE_RESERVED) {
av_log(s->avctx, AV_LOG_ERROR, "Reserved frame type\n");
- return AC3_PARSE_ERROR_FRAME_TYPE;
+ return AAC_AC3_PARSE_ERROR_FRAME_TYPE;
}
/* The substream id indicates which substream this frame belongs to. each
@@ -194,8 +196,8 @@ int ff_eac3_parse_header(AC3DecodeContext *s)
associated to an independent stream have matching substream id's. */
if (s->substreamid) {
/* only decode substream with id=0. skip any additional substreams. */
- av_log_missing_feature(s->avctx, "Additional substreams", 1);
- return AC3_PARSE_ERROR_FRAME_TYPE;
+ ff_log_missing_feature(s->avctx, "Additional substreams", 1);
+ return AAC_AC3_PARSE_ERROR_FRAME_TYPE;
}
if (s->bit_alloc_params.sr_code == EAC3_SR_CODE_REDUCED) {
@@ -203,7 +205,7 @@ int ff_eac3_parse_header(AC3DecodeContext *s)
rates in bit allocation. The best assumption would be that it is
handled like AC-3 DolbyNet, but we cannot be sure until we have a
sample which utilizes this feature. */
- av_log_missing_feature(s->avctx, "Reduced sampling rates", 1);
+ ff_log_missing_feature(s->avctx, "Reduced sampling rates", 1);
return -1;
}
skip_bits(gbc, 5); // skip bitstream id
@@ -460,7 +462,7 @@ int ff_eac3_parse_header(AC3DecodeContext *s)
/* spectral extension attenuation data */
if (parse_spx_atten_data) {
- av_log_missing_feature(s->avctx, "Spectral extension attenuation", 1);
+ ff_log_missing_feature(s->avctx, "Spectral extension attenuation", 1);
for (ch = 1; ch <= s->fbw_channels; ch++) {
if (get_bits1(gbc)) { // channel has spx attenuation
skip_bits(gbc, 5); // skip spx attenuation code
@@ -475,7 +477,8 @@ int ff_eac3_parse_header(AC3DecodeContext *s)
The spec does not say what this data is or what it's used for.
It is likely the offset of each block within the frame. */
int block_start_bits = (s->num_blocks-1) * (4 + av_log2(s->frame_size-2));
- skip_bits(gbc, block_start_bits);
+ skip_bits_long(gbc, block_start_bits);
+ ff_log_missing_feature(s->avctx, "Block start info", 1);
}
/* syntax state initialization */
diff --git a/libavcodec/eatgq.c b/libavcodec/eatgq.c
index b65c8a5..fa8faf4 100644
--- a/libavcodec/eatgq.c
+++ b/libavcodec/eatgq.c
@@ -33,7 +33,7 @@
#include "bitstream.h"
#include "bytestream.h"
#include "dsputil.h"
-extern const uint16_t ff_inv_aanscales[64]; //mpegvideo_enc.c
+#include "aandcttab.h"
typedef struct TgqContext {
AVCodecContext *avctx;
diff --git a/libavcodec/error_resilience.c b/libavcodec/error_resilience.c
index f15b0b6..bdd3a66 100644
--- a/libavcodec/error_resilience.c
+++ b/libavcodec/error_resilience.c
@@ -563,7 +563,7 @@ static int is_intra_more_likely(MpegEncContext *s){
if(undamaged_count < 5) return 0; //almost all MBs damaged -> use temporal prediction
-#ifdef HAVE_XVMC
+#ifdef CONFIG_XVMC
//prevent dsp.sad() check, that requires access to the image
if(s->avctx->xvmc_acceleration && s->pict_type==FF_I_TYPE) return 1;
#endif
@@ -681,6 +681,7 @@ void ff_er_frame_end(MpegEncContext *s){
Picture *pic= s->current_picture_ptr;
if(!s->error_recognition || s->error_count==0 || s->avctx->lowres ||
+ s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU ||
s->error_count==3*s->mb_width*(s->avctx->skip_top + s->avctx->skip_bottom)) return;
if(s->current_picture.motion_val[0] == NULL){
@@ -934,7 +935,7 @@ void ff_er_frame_end(MpegEncContext *s){
}else
guess_mv(s);
-#ifdef HAVE_XVMC
+#ifdef CONFIG_XVMC
/* the filters below are not XvMC compatible, skip them */
if(s->avctx->xvmc_acceleration) goto ec_clean;
#endif
@@ -1023,7 +1024,7 @@ void ff_er_frame_end(MpegEncContext *s){
v_block_filter(s, s->current_picture.data[2], s->mb_width , s->mb_height , s->uvlinesize, 0);
}
-#ifdef HAVE_XVMC
+#ifdef CONFIG_XVMC
ec_clean:
#endif
/* clean a few tables */
diff --git a/libavcodec/eval.c b/libavcodec/eval.c
index be393a6..fce6da5 100644
--- a/libavcodec/eval.c
+++ b/libavcodec/eval.c
@@ -47,12 +47,12 @@
typedef struct Parser{
int stack_index;
char *s;
- double *const_value;
- const char **const_name; // NULL terminated
+ const double *const_value;
+ const char * const *const_name; // NULL terminated
double (**func1)(void *, double a); // NULL terminated
const char **func1_name; // NULL terminated
double (**func2)(void *, double a, double b); // NULL terminated
- char **func2_name; // NULL terminated
+ const char **func2_name; // NULL terminated
void *opaque;
const char **error;
#define VARS 10
@@ -375,9 +375,9 @@ static int verify_expr(AVEvalExpr * e) {
}
}
-AVEvalExpr * ff_parse(const char *s, const char **const_name,
+AVEvalExpr * ff_parse(const char *s, const char * const *const_name,
double (**func1)(void *, double), const char **func1_name,
- double (**func2)(void *, double, double), char **func2_name,
+ double (**func2)(void *, double, double), const char **func2_name,
const char **error){
Parser p;
AVEvalExpr * e;
@@ -404,7 +404,7 @@ AVEvalExpr * ff_parse(const char *s, const char **const_name,
return e;
}
-double ff_parse_eval(AVEvalExpr * e, double *const_value, void *opaque) {
+double ff_parse_eval(AVEvalExpr * e, const double *const_value, void *opaque) {
Parser p;
p.const_value= const_value;
@@ -412,9 +412,9 @@ double ff_parse_eval(AVEvalExpr * e, double *const_value, void *opaque) {
return eval_expr(&p, e);
}
-double ff_eval2(const char *s, double *const_value, const char **const_name,
+double ff_eval2(const char *s, const double *const_value, const char * const *const_name,
double (**func1)(void *, double), const char **func1_name,
- double (**func2)(void *, double, double), char **func2_name,
+ double (**func2)(void *, double, double), const char **func2_name,
void *opaque, const char **error){
AVEvalExpr * e = ff_parse(s, const_name, func1, func1_name, func2, func2_name, error);
double d;
diff --git a/libavcodec/eval.h b/libavcodec/eval.h
index 0918fc9..c450332 100644
--- a/libavcodec/eval.h
+++ b/libavcodec/eval.h
@@ -42,9 +42,9 @@
* @param opaque a pointer which will be passed to all functions from func1 and func2
* @return the value of the expression
*/
-double ff_eval2(const char *s, double *const_value, const char **const_name,
+double ff_eval2(const char *s, const double *const_value, const char * const *const_name,
double (**func1)(void *, double), const char **func1_name,
- double (**func2)(void *, double, double), char **func2_name,
+ double (**func2)(void *, double, double), const char **func2_name,
void *opaque, const char **error);
typedef struct ff_expr_s AVEvalExpr;
@@ -61,9 +61,9 @@ typedef struct ff_expr_s AVEvalExpr;
* @return AVEvalExpr which must be freed with ff_eval_free by the user when it is not needed anymore
* NULL if anything went wrong
*/
-AVEvalExpr * ff_parse(const char *s, const char **const_name,
+AVEvalExpr * ff_parse(const char *s, const char * const *const_name,
double (**func1)(void *, double), const char **func1_name,
- double (**func2)(void *, double, double), char **func2_name,
+ double (**func2)(void *, double, double), const char **func2_name,
const char **error);
/**
* Evaluates a previously parsed expression.
@@ -71,7 +71,7 @@ AVEvalExpr * ff_parse(const char *s, const char **const_name,
* @param opaque a pointer which will be passed to all functions from func1 and func2
* @return the value of the expression
*/
-double ff_parse_eval(AVEvalExpr * e, double *const_value, void *opaque);
+double ff_parse_eval(AVEvalExpr * e, const double *const_value, void *opaque);
void ff_eval_free(AVEvalExpr * e);
#endif /* AVCODEC_EVAL_H */
diff --git a/libavcodec/faxcompr.c b/libavcodec/faxcompr.c
new file mode 100644
index 0000000..e7f7706
--- /dev/null
+++ b/libavcodec/faxcompr.c
@@ -0,0 +1,313 @@
+/*
+ * CCITT Fax Group 3 and 4 decompression
+ * Copyright (c) 2008 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * CCITT Fax Group 3 and 4 decompression
+ * @file faxcompr.c
+ * @author Konstantin Shishkov
+ */
+#include "avcodec.h"
+#include "bitstream.h"
+#include "faxcompr.h"
+
+#define CCITT_SYMS 104
+
+static const uint16_t ccitt_syms[CCITT_SYMS] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896,
+ 960, 1024, 1088, 1152, 1216, 1280, 1344, 1408, 1472, 1536, 1600, 1664, 1728,
+ 1792, 1856, 1920, 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560
+};
+
+static const uint8_t ccitt_codes_bits[2][CCITT_SYMS] =
+{
+ {
+ 0x35, 0x07, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, 0x13, 0x14, 0x07, 0x08, 0x08,
+ 0x03, 0x34, 0x35, 0x2A, 0x2B, 0x27, 0x0C, 0x08, 0x17, 0x03, 0x04, 0x28, 0x2B,
+ 0x13, 0x24, 0x18, 0x02, 0x03, 0x1A, 0x1B, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x04, 0x05, 0x0A, 0x0B, 0x52, 0x53, 0x54,
+ 0x55, 0x24, 0x25, 0x58, 0x59, 0x5A, 0x5B, 0x4A, 0x4B, 0x32, 0x33, 0x34, 0x1B,
+ 0x12, 0x17, 0x37, 0x36, 0x37, 0x64, 0x65, 0x68, 0x67, 0xCC, 0xCD, 0xD2, 0xD3,
+ 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0x98, 0x99, 0x9A, 0x18, 0x9B,
+ 0x08, 0x0C, 0x0D, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F
+ },
+ {
+ 0x37, 0x02, 0x03, 0x02, 0x03, 0x03, 0x02, 0x03, 0x05, 0x04, 0x04, 0x05, 0x07,
+ 0x04, 0x07, 0x18, 0x17, 0x18, 0x08, 0x67, 0x68, 0x6C, 0x37, 0x28, 0x17, 0x18,
+ 0xCA, 0xCB, 0xCC, 0xCD, 0x68, 0x69, 0x6A, 0x6B, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
+ 0xD7, 0x6C, 0x6D, 0xDA, 0xDB, 0x54, 0x55, 0x56, 0x57, 0x64, 0x65, 0x52, 0x53,
+ 0x24, 0x37, 0x38, 0x27, 0x28, 0x58, 0x59, 0x2B, 0x2C, 0x5A, 0x66, 0x67, 0x0F,
+ 0xC8, 0xC9, 0x5B, 0x33, 0x34, 0x35, 0x6C, 0x6D, 0x4A, 0x4B, 0x4C, 0x4D, 0x72,
+ 0x73, 0x74, 0x75, 0x76, 0x77, 0x52, 0x53, 0x54, 0x55, 0x5A, 0x5B, 0x64, 0x65,
+ 0x08, 0x0C, 0x0D, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F
+ }
+};
+
+static const uint8_t ccitt_codes_lens[2][CCITT_SYMS] =
+{
+ {
+ 8, 6, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 5, 5, 6, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 6, 9, 11, 11, 11, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12
+ },
+ {
+ 10, 3, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 7, 8, 8, 9, 10, 10, 10, 11,
+ 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 10, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 11, 11, 11, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12
+ }
+};
+
+static const uint8_t ccitt_group3_2d_bits[11] = {
+ 1, 1, 2, 2, 2, 1, 3, 3, 3, 1, 1
+};
+
+static const uint8_t ccitt_group3_2d_lens[11] = {
+ 4, 3, 7, 6, 3, 1, 3, 6, 7, 7, 9
+};
+
+static VLC ccitt_vlc[2], ccitt_group3_2d_vlc;
+
+av_cold void ff_ccitt_unpack_init()
+{
+ static VLC_TYPE code_table1[528][2];
+ static VLC_TYPE code_table2[648][2];
+ int i;
+ static int initialized = 0;
+
+ if(initialized)
+ return;
+ ccitt_vlc[0].table = code_table1;
+ ccitt_vlc[0].table_allocated = 528;
+ ccitt_vlc[1].table = code_table2;
+ ccitt_vlc[1].table_allocated = 648;
+ for(i = 0; i < 2; i++){
+ init_vlc_sparse(&ccitt_vlc[i], 9, CCITT_SYMS,
+ ccitt_codes_lens[i], 1, 1,
+ ccitt_codes_bits[i], 1, 1,
+ ccitt_syms, 2, 2,
+ INIT_VLC_USE_NEW_STATIC);
+ }
+ INIT_VLC_STATIC(&ccitt_group3_2d_vlc, 9, 11,
+ ccitt_group3_2d_lens, 1, 1,
+ ccitt_group3_2d_bits, 1, 1, 512);
+ initialized = 1;
+}
+
+
+static int decode_group3_1d_line(AVCodecContext *avctx, GetBitContext *gb,
+ unsigned int pix_left, int *runs, const int *runend)
+{
+ int mode = 0;
+ unsigned int run=0;
+ unsigned int t;
+ for(;;){
+ t = get_vlc2(gb, ccitt_vlc[mode].table, 9, 2);
+ run += t;
+ if(t < 64){
+ *runs++ = run;
+ if(runs >= runend){
+ av_log(avctx, AV_LOG_ERROR, "Run overrun\n");
+ return -1;
+ }
+ if(pix_left <= run){
+ if(pix_left == run)
+ break;
+ av_log(avctx, AV_LOG_ERROR, "Run went out of bounds\n");
+ return -1;
+ }
+ pix_left -= run;
+ run = 0;
+ mode = !mode;
+ }else if((int)t == -1){
+ av_log(avctx, AV_LOG_ERROR, "Incorrect code\n");
+ return -1;
+ }
+ }
+ *runs++ = 0;
+ return 0;
+}
+
+static int decode_group3_2d_line(AVCodecContext *avctx, GetBitContext *gb,
+ unsigned int width, int *runs, const int *runend, const int *ref)
+{
+ int mode = 0, saved_run = 0, t;
+ int run_off = *ref++;
+ unsigned int offs=0, run= 0;
+
+ runend--; // for the last written 0
+
+ while(offs < width){
+ int cmode = get_vlc2(gb, ccitt_group3_2d_vlc.table, 9, 1);
+ if(cmode == -1){
+ av_log(avctx, AV_LOG_ERROR, "Incorrect mode VLC\n");
+ return -1;
+ }
+ if(!cmode){//pass mode
+ run_off += *ref++;
+ run = run_off - offs;
+ offs= run_off;
+ run_off += *ref++;
+ if(offs > width){
+ av_log(avctx, AV_LOG_ERROR, "Run went out of bounds\n");
+ return -1;
+ }
+ saved_run += run;
+ }else if(cmode == 1){//horizontal mode
+ int k;
+ for(k = 0; k < 2; k++){
+ run = 0;
+ for(;;){
+ t = get_vlc2(gb, ccitt_vlc[mode].table, 9, 2);
+ if(t == -1){
+ av_log(avctx, AV_LOG_ERROR, "Incorrect code\n");
+ return -1;
+ }
+ run += t;
+ if(t < 64)
+ break;
+ }
+ *runs++ = run + saved_run;
+ if(runs >= runend){
+ av_log(avctx, AV_LOG_ERROR, "Run overrun\n");
+ return -1;
+ }
+ saved_run = 0;
+ offs += run;
+ if(offs > width || run > width){
+ av_log(avctx, AV_LOG_ERROR, "Run went out of bounds\n");
+ return -1;
+ }
+ mode = !mode;
+ }
+ }else if(cmode == 9 || cmode == 10){
+ av_log(avctx, AV_LOG_ERROR, "Special modes are not supported (yet)\n");
+ return -1;
+ }else{//vertical mode
+ run = run_off - offs + (cmode - 5);
+ run_off -= *--ref;
+ offs += run;
+ if(offs > width || run > width){
+ av_log(avctx, AV_LOG_ERROR, "Run went out of bounds\n");
+ return -1;
+ }
+ *runs++ = run + saved_run;
+ if(runs >= runend){
+ av_log(avctx, AV_LOG_ERROR, "Run overrun\n");
+ return -1;
+ }
+ saved_run = 0;
+ mode = !mode;
+ }
+ //sync line pointers
+ while(run_off <= offs){
+ run_off += *ref++;
+ run_off += *ref++;
+ }
+ }
+ *runs++ = saved_run;
+ *runs++ = 0;
+ return 0;
+}
+
+static void put_line(uint8_t *dst, int size, int width, const int *runs)
+{
+ PutBitContext pb;
+ int run, mode = ~0, pix_left = width, run_idx = 0;
+
+ init_put_bits(&pb, dst, size*8);
+ while(pix_left > 0){
+ run = runs[run_idx++];
+ mode = ~mode;
+ pix_left -= run;
+ for(; run > 16; run -= 16)
+ put_sbits(&pb, 16, mode);
+ if(run)
+ put_sbits(&pb, run, mode);
+ }
+}
+
+static int find_group3_syncmarker(GetBitContext *gb, int srcsize)
+{
+ unsigned int state = -1;
+ srcsize -= get_bits_count(gb);
+ while(srcsize-- > 0){
+ state+= state + get_bits1(gb);
+ if((state & 0xFFF) == 1)
+ return 0;
+ }
+ return -1;
+}
+
+int ff_ccitt_unpack(AVCodecContext *avctx,
+ const uint8_t *src, int srcsize,
+ uint8_t *dst, int height, int stride, enum TiffCompr compr)
+{
+ int j;
+ GetBitContext gb;
+ int *runs, *ref, *runend;
+ int ret;
+ int runsize= avctx->width + 2;
+
+ runs = av_malloc(runsize * sizeof(runs[0]));
+ ref = av_malloc(runsize * sizeof(ref[0]));
+ ref[0] = avctx->width;
+ ref[1] = 0;
+ ref[2] = 0;
+ init_get_bits(&gb, src, srcsize*8);
+ for(j = 0; j < height; j++){
+ runend = runs + runsize;
+ if(compr == TIFF_G4){
+ ret = decode_group3_2d_line(avctx, &gb, avctx->width, runs, runend, ref);
+ if(ret < 0){
+ av_free(runs);
+ av_free(ref);
+ return -1;
+ }
+ }else{
+ if(find_group3_syncmarker(&gb, srcsize*8) < 0)
+ break;
+ if(compr==TIFF_CCITT_RLE || get_bits1(&gb))
+ ret = decode_group3_1d_line(avctx, &gb, avctx->width, runs, runend);
+ else
+ ret = decode_group3_2d_line(avctx, &gb, avctx->width, runs, runend, ref);
+ }
+ if(ret < 0){
+ put_line(dst, stride, avctx->width, ref);
+ }else{
+ put_line(dst, stride, avctx->width, runs);
+ FFSWAP(int*, runs, ref);
+ }
+ dst += stride;
+ }
+ av_free(runs);
+ av_free(ref);
+ return 0;
+}
diff --git a/libavcodec/faxcompr.h b/libavcodec/faxcompr.h
new file mode 100644
index 0000000..632744b
--- /dev/null
+++ b/libavcodec/faxcompr.h
@@ -0,0 +1,45 @@
+/*
+ * CCITT Fax Group 3 and 4 decompression
+ * Copyright (c) 2008 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * CCITT Fax Group 3 and 4 decompression
+ * @file faxcompr.h
+ * @author Konstantin Shishkov
+ */
+#ifndef AVCODEC_FAXCOMPR_H
+#define AVCODEC_FAXCOMPR_H
+
+#include "avcodec.h"
+#include "tiff.h"
+
+/**
+ * initialize upacker code
+ */
+void ff_ccitt_unpack_init();
+
+/**
+ * unpack data compressed with CCITT Group 3 1/2-D or Group 4 method
+ */
+int ff_ccitt_unpack(AVCodecContext *avctx,
+ const uint8_t *src, int srcsize,
+ uint8_t *dst, int height, int stride, enum TiffCompr compr);
+
+#endif /* AVCODEC_FAXCOMPR_H */
diff --git a/libavcodec/flashsv.c b/libavcodec/flashsv.c
index 3842594..abe1d87 100644
--- a/libavcodec/flashsv.c
+++ b/libavcodec/flashsv.c
@@ -211,7 +211,7 @@ static int flashsv_decode_frame(AVCodecContext *avctx,
/* return -1; */
}
copy_region(s->tmpblock, s->frame.data[0], s->image_height-(hp+hs+1), wp, hs, ws, s->frame.linesize[0]);
- skip_bits(&gb, 8*size); /* skip the consumed bits */
+ skip_bits_long(&gb, 8*size); /* skip the consumed bits */
}
}
}
diff --git a/libavcodec/fraps.c b/libavcodec/fraps.c
index e03c1fa..0c31e03 100644
--- a/libavcodec/fraps.c
+++ b/libavcodec/fraps.c
@@ -148,10 +148,10 @@ static int decode_frame(AVCodecContext *avctx,
version = header & 0xff;
header_size = (header & (1<<30))? 8 : 4; /* bit 30 means pad to 8 bytes */
- if (version > 2 && version != 4 && version != 5) {
+ if (version > 5) {
av_log(avctx, AV_LOG_ERROR,
"This file is encoded with Fraps version %d. " \
- "This codec can only decode version 0, 1, 2 and 4.\n", version);
+ "This codec can only decode versions <= 5.\n", version);
return -1;
}
@@ -288,6 +288,7 @@ static int decode_frame(AVCodecContext *avctx,
}
}
break;
+ case 3:
case 5:
/* Virtually the same as version 4, but is for RGB24 */
avctx->pix_fmt = PIX_FMT_BGR24;
diff --git a/libavcodec/golomb.c b/libavcodec/golomb.c
index 79dc0a7..51e0f9d 100644
--- a/libavcodec/golomb.c
+++ b/libavcodec/golomb.c
@@ -29,7 +29,7 @@
#include "libavutil/common.h"
const uint8_t ff_golomb_vlc_len[512]={
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+14,13,12,12,11,11,11,11,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
@@ -48,7 +48,7 @@ const uint8_t ff_golomb_vlc_len[512]={
};
const uint8_t ff_ue_golomb_vlc_code[512]={
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,
+31,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,
7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
@@ -67,7 +67,7 @@ const uint8_t ff_ue_golomb_vlc_code[512]={
};
const int8_t ff_se_golomb_vlc_code[512]={
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, -8, 9, -9, 10,-10, 11,-11, 12,-12, 13,-13, 14,-14, 15,-15,
+ 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 8, -8, 9, -9, 10,-10, 11,-11, 12,-12, 13,-13, 14,-14, 15,-15,
4, 4, 4, 4, -4, -4, -4, -4, 5, 5, 5, 5, -5, -5, -5, -5, 6, 6, 6, 6, -6, -6, -6, -6, 7, 7, 7, 7, -7, -7, -7, -7,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
diff --git a/libavcodec/golomb.h b/libavcodec/golomb.h
index 627fcdc..3c3f759 100644
--- a/libavcodec/golomb.h
+++ b/libavcodec/golomb.h
@@ -74,6 +74,23 @@ static inline int get_ue_golomb(GetBitContext *gb){
}
}
+ /**
+ * read unsigned exp golomb code, constraint to a max of 31
+ */
+static inline int get_ue_golomb_31(GetBitContext *gb){
+ unsigned int buf;
+
+ OPEN_READER(re, gb);
+ UPDATE_CACHE(re, gb);
+ buf=GET_CACHE(re, gb);
+
+ buf >>= 32 - 9;
+ LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]);
+ CLOSE_READER(re, gb);
+
+ return ff_ue_golomb_vlc_code[buf];
+}
+
static inline int svq3_get_ue_golomb(GetBitContext *gb){
uint32_t buf;
diff --git a/libavcodec/h263.c b/libavcodec/h263.c
index f72f440..781c007 100644
--- a/libavcodec/h263.c
+++ b/libavcodec/h263.c
@@ -810,7 +810,7 @@ static inline int get_p_cbp(MpegEncContext * s,
for (i = 0; i < 6; i++) {
if (s->block_last_index[i] >= 0 && ((cbp >> (5 - i))&1)==0 ){
s->block_last_index[i]= -1;
- memset(s->block[i], 0, sizeof(DCTELEM)*64);
+ s->dsp.clear_block(s->block[i]);
}
}
}else{
@@ -853,7 +853,7 @@ static inline int get_b_cbp(MpegEncContext * s, DCTELEM block[6][64],
for (i = 0; i < 6; i++) {
if (s->block_last_index[i] >= 0 && ((cbp >> (5 - i))&1)==0 ){
s->block_last_index[i]= -1;
- memset(s->block[i], 0, sizeof(DCTELEM)*64);
+ s->dsp.clear_block(s->block[i]);
}
}
}else{
@@ -1485,17 +1485,17 @@ void ff_h263_loop_filter(MpegEncContext * s){
qp_c= 0;
if(s->mb_y){
- int qp_dt, qp_t, qp_tc;
+ int qp_dt, qp_tt, qp_tc;
if(IS_SKIP(s->current_picture.mb_type[xy-s->mb_stride]))
- qp_t=0;
+ qp_tt=0;
else
- qp_t= s->current_picture.qscale_table[xy-s->mb_stride];
+ qp_tt= s->current_picture.qscale_table[xy-s->mb_stride];
if(qp_c)
qp_tc= qp_c;
else
- qp_tc= qp_t;
+ qp_tc= qp_tt;
if(qp_tc){
const int chroma_qp= s->chroma_qscale_table[qp_tc];
@@ -1506,12 +1506,12 @@ void ff_h263_loop_filter(MpegEncContext * s){
s->dsp.h263_v_loop_filter(dest_cr , uvlinesize, chroma_qp);
}
- if(qp_t)
- s->dsp.h263_h_loop_filter(dest_y-8*linesize+8 , linesize, qp_t);
+ if(qp_tt)
+ s->dsp.h263_h_loop_filter(dest_y-8*linesize+8 , linesize, qp_tt);
if(s->mb_x){
- if(qp_t || IS_SKIP(s->current_picture.mb_type[xy-1-s->mb_stride]))
- qp_dt= qp_t;
+ if(qp_tt || IS_SKIP(s->current_picture.mb_type[xy-1-s->mb_stride]))
+ qp_dt= qp_tt;
else
qp_dt= s->current_picture.qscale_table[xy-1-s->mb_stride];
@@ -4651,7 +4651,7 @@ retry:
rl = &rl_intra_aic;
i = 0;
s->gb= gb;
- memset(block, 0, sizeof(DCTELEM)*64);
+ s->dsp.clear_block(block);
goto retry;
}
av_log(s->avctx, AV_LOG_ERROR, "run overflow at %dx%d i:%d\n", s->mb_x, s->mb_y, s->mb_intra);
@@ -5666,7 +5666,58 @@ static int decode_vol_header(MpegEncContext *s, GetBitContext *gb){
s->quarter_sample= get_bits1(gb);
else s->quarter_sample=0;
- if(!get_bits1(gb)) av_log(s->avctx, AV_LOG_ERROR, "Complexity estimation not supported\n");
+ if(!get_bits1(gb)){
+ int pos= get_bits_count(gb);
+ int estimation_method= get_bits(gb, 2);
+ if(estimation_method<2){
+ if(!get_bits1(gb)){
+ s->cplx_estimation_trash_i += 8*get_bits1(gb); //opaque
+ s->cplx_estimation_trash_i += 8*get_bits1(gb); //transparent
+ s->cplx_estimation_trash_i += 8*get_bits1(gb); //intra_cae
+ s->cplx_estimation_trash_i += 8*get_bits1(gb); //inter_cae
+ s->cplx_estimation_trash_i += 8*get_bits1(gb); //no_update
+ s->cplx_estimation_trash_i += 8*get_bits1(gb); //upampling
+ }
+ if(!get_bits1(gb)){
+ s->cplx_estimation_trash_i += 8*get_bits1(gb); //intra_blocks
+ s->cplx_estimation_trash_p += 8*get_bits1(gb); //inter_blocks
+ s->cplx_estimation_trash_p += 8*get_bits1(gb); //inter4v_blocks
+ s->cplx_estimation_trash_i += 8*get_bits1(gb); //not coded blocks
+ }
+ if(!check_marker(gb, "in complexity estimation part 1")){
+ skip_bits_long(gb, pos - get_bits_count(gb));
+ goto no_cplx_est;
+ }
+ if(!get_bits1(gb)){
+ s->cplx_estimation_trash_i += 8*get_bits1(gb); //dct_coeffs
+ s->cplx_estimation_trash_i += 8*get_bits1(gb); //dct_lines
+ s->cplx_estimation_trash_i += 8*get_bits1(gb); //vlc_syms
+ s->cplx_estimation_trash_i += 4*get_bits1(gb); //vlc_bits
+ }
+ if(!get_bits1(gb)){
+ s->cplx_estimation_trash_p += 8*get_bits1(gb); //apm
+ s->cplx_estimation_trash_p += 8*get_bits1(gb); //npm
+ s->cplx_estimation_trash_b += 8*get_bits1(gb); //interpolate_mc_q
+ s->cplx_estimation_trash_p += 8*get_bits1(gb); //forwback_mc_q
+ s->cplx_estimation_trash_p += 8*get_bits1(gb); //halfpel2
+ s->cplx_estimation_trash_p += 8*get_bits1(gb); //halfpel4
+ }
+ if(!check_marker(gb, "in complexity estimation part 2")){
+ skip_bits_long(gb, pos - get_bits_count(gb));
+ goto no_cplx_est;
+ }
+ if(estimation_method==1){
+ s->cplx_estimation_trash_i += 8*get_bits1(gb); //sadct
+ s->cplx_estimation_trash_p += 8*get_bits1(gb); //qpel
+ }
+ }else
+ av_log(s->avctx, AV_LOG_ERROR, "Invalid Complexity estimation method %d\n", estimation_method);
+ }else{
+no_cplx_est:
+ s->cplx_estimation_trash_i=
+ s->cplx_estimation_trash_p=
+ s->cplx_estimation_trash_b= 0;
+ }
s->resync_marker= !get_bits1(gb); /* resync_marker_disabled */
@@ -5903,6 +5954,12 @@ static int decode_vop_header(MpegEncContext *s, GetBitContext *gb){
//FIXME complexity estimation stuff
if (s->shape != BIN_ONLY_SHAPE) {
+ skip_bits_long(gb, s->cplx_estimation_trash_i);
+ if(s->pict_type != FF_I_TYPE)
+ skip_bits_long(gb, s->cplx_estimation_trash_p);
+ if(s->pict_type == FF_B_TYPE)
+ skip_bits_long(gb, s->cplx_estimation_trash_b);
+
s->intra_dc_threshold= mpeg4_dc_threshold[ get_bits(gb, 3) ];
if(!s->progressive_sequence){
s->top_field_first= get_bits1(gb);
@@ -5951,12 +6008,12 @@ static int decode_vop_header(MpegEncContext *s, GetBitContext *gb){
s->b_code=1;
if(s->avctx->debug&FF_DEBUG_PICT_INFO){
- av_log(s->avctx, AV_LOG_DEBUG, "qp:%d fc:%d,%d %s size:%d pro:%d alt:%d top:%d %spel part:%d resync:%d w:%d a:%d rnd:%d vot:%d%s dc:%d\n",
+ av_log(s->avctx, AV_LOG_DEBUG, "qp:%d fc:%d,%d %s size:%d pro:%d alt:%d top:%d %spel part:%d resync:%d w:%d a:%d rnd:%d vot:%d%s dc:%d ce:%d/%d/%d\n",
s->qscale, s->f_code, s->b_code,
s->pict_type == FF_I_TYPE ? "I" : (s->pict_type == FF_P_TYPE ? "P" : (s->pict_type == FF_B_TYPE ? "B" : "S")),
gb->size_in_bits,s->progressive_sequence, s->alternate_scan, s->top_field_first,
s->quarter_sample ? "q" : "h", s->data_partitioning, s->resync_marker, s->num_sprite_warping_points,
- s->sprite_warping_accuracy, 1-s->no_rounding, s->vo_type, s->vol_control_parameters ? " VOLC" : " ", s->intra_dc_threshold);
+ s->sprite_warping_accuracy, 1-s->no_rounding, s->vo_type, s->vol_control_parameters ? " VOLC" : " ", s->intra_dc_threshold, s->cplx_estimation_trash_i, s->cplx_estimation_trash_p, s->cplx_estimation_trash_b);
}
if(!s->scalability){
@@ -6008,7 +6065,7 @@ int ff_mpeg4_decode_picture_header(MpegEncContext * s, GetBitContext *gb)
if(s->codec_tag == ff_get_fourcc("WV1F") && show_bits(gb, 24) == 0x575630){
skip_bits(gb, 24);
if(get_bits(gb, 8) == 0xF0)
- return decode_vop_header(s, gb);
+ goto end;
}
startcode = 0xff;
@@ -6071,12 +6128,17 @@ int ff_mpeg4_decode_picture_header(MpegEncContext * s, GetBitContext *gb)
mpeg4_decode_gop_header(s, gb);
}
else if(startcode == VOP_STARTCODE){
- return decode_vop_header(s, gb);
+ break;
}
align_get_bits(gb);
startcode = 0xff;
}
+end:
+ if(s->flags& CODEC_FLAG_LOW_DELAY)
+ s->low_delay=1;
+ s->avctx->has_b_frames= !s->low_delay;
+ return decode_vop_header(s, gb);
}
/* don't understand why they choose a different header ! */
diff --git a/libavcodec/h263dec.c b/libavcodec/h263dec.c
index 141d153..ae318b5 100644
--- a/libavcodec/h263dec.c
+++ b/libavcodec/h263dec.c
@@ -402,9 +402,6 @@ retry:
ret = ff_mpeg4_decode_picture_header(s, &gb);
}
ret = ff_mpeg4_decode_picture_header(s, &s->gb);
-
- if(s->flags& CODEC_FLAG_LOW_DELAY)
- s->low_delay=1;
} else if (s->codec_id == CODEC_ID_H263I) {
ret = intel_h263_decode_picture_header(s);
} else if (s->h263_flv) {
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 915c078..a8bd062 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -33,10 +33,11 @@
#include "h264_parser.h"
#include "golomb.h"
#include "rectangle.h"
+#include "vdpau_internal.h"
#include "cabac.h"
#ifdef ARCH_X86
-#include "i386/h264_i386.h"
+#include "x86/h264_i386.h"
#endif
//#undef NDEBUG
@@ -101,12 +102,15 @@ static const int left_block_options[4][8]={
{0,2,0,2,7,10,7,10}
};
+#define LEVEL_TAB_BITS 8
+static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
+
static void fill_caches(H264Context *h, int mb_type, int for_deblock){
MpegEncContext * const s = &h->s;
const int mb_xy= h->mb_xy;
int topleft_xy, top_xy, topright_xy, left_xy[2];
int topleft_type, top_type, topright_type, left_type[2];
- int * left_block;
+ const int * left_block;
int topleft_partition= -1;
int i;
@@ -128,46 +132,34 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
const int top_pair_xy = pair_xy - s->mb_stride;
const int topleft_pair_xy = top_pair_xy - 1;
const int topright_pair_xy = top_pair_xy + 1;
- const int topleft_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
- const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
- const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
- const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
- const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
+ const int topleft_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
+ const int top_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
+ const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
+ const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
+ const int curr_mb_field_flag = IS_INTERLACED(mb_type);
const int bottom = (s->mb_y & 1);
- tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
- if (bottom
- ? !curr_mb_frame_flag // bottom macroblock
- : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
- ) {
+ tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
+
+ if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
top_xy -= s->mb_stride;
}
- if (bottom
- ? !curr_mb_frame_flag // bottom macroblock
- : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
- ) {
+ if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
topleft_xy -= s->mb_stride;
- } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
+ } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
topleft_xy += s->mb_stride;
// take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
topleft_partition = 0;
}
- if (bottom
- ? !curr_mb_frame_flag // bottom macroblock
- : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
- ) {
+ if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
topright_xy -= s->mb_stride;
}
- if (left_mb_frame_flag != curr_mb_frame_flag) {
+ if (left_mb_field_flag != curr_mb_field_flag) {
left_xy[1] = left_xy[0] = pair_xy - 1;
- if (curr_mb_frame_flag) {
- if (bottom) {
- left_block = left_block_options[1];
- } else {
- left_block= left_block_options[2];
- }
- } else {
+ if (curr_mb_field_flag) {
left_xy[1] += s->mb_stride;
left_block = left_block_options[3];
+ } else {
+ left_block= left_block_options[2 - bottom];
}
}
}
@@ -589,13 +581,13 @@ static inline int check_intra4x4_pred_mode(H264Context *h){
static const int mask[4]={0x8000,0x2000,0x80,0x20};
for(i=0; i<4; i++){
if(!(h->left_samples_available&mask[i])){
- int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
- if(status<0){
- av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
- return -1;
- } else if(status){
- h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
- }
+ int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
+ if(status<0){
+ av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
+ return -1;
+ } else if(status){
+ h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
+ }
}
}
}
@@ -882,8 +874,8 @@ static inline void pred_pskip_motion(H264Context * const h, int * const mx, int
tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
- || (top_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
- || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
+ || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
+ || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
*mx = *my = 0;
return;
@@ -1387,9 +1379,26 @@ static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_le
for(i=0; i<length; i++)
printf("%2X ", src[i]);
#endif
+
+#ifdef HAVE_FAST_UNALIGNED
+# ifdef HAVE_FAST_64BIT
+# define RS 7
+ for(i=0; i+1<length; i+=9){
+ if(!((~*(uint64_t*)(src+i) & (*(uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
+# else
+# define RS 3
+ for(i=0; i+1<length; i+=5){
+ if(!((~*(uint32_t*)(src+i) & (*(uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
+# endif
+ continue;
+ if(i>0 && !src[i]) i--;
+ while(src[i]) i++;
+#else
+# define RS 0
for(i=0; i+1<length; i+=2){
if(src[i]) continue;
if(i>0 && src[i-1]==0) i--;
+#endif
if(i+2<length && src[i+1]==0 && src[i+2]<=3){
if(src[i+2]!=3){
/* startcode, so we must be past the end */
@@ -1397,6 +1406,7 @@ static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_le
}
break;
}
+ i-= RS;
}
if(i>=length-1){ //no escaped 0
@@ -1406,7 +1416,7 @@ static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_le
}
bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
- h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
+ h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
dst= h->rbsp_buffer[bufidx];
if (dst == NULL){
@@ -1414,21 +1424,30 @@ static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_le
}
//printf("decoding esc\n");
- si=di=0;
- while(si<length){
+ memcpy(dst, src, i);
+ si=di=i;
+ while(si+2<length){
//remove escapes (very rare 1:2^22)
- if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
+ if(src[si+2]>3){
+ dst[di++]= src[si++];
+ dst[di++]= src[si++];
+ }else if(src[si]==0 && src[si+1]==0){
if(src[si+2]==3){ //escape
dst[di++]= 0;
dst[di++]= 0;
si+=3;
continue;
}else //next start code
- break;
+ goto nsc;
}
dst[di++]= src[si++];
}
+ while(si<length)
+ dst[di++]= src[si++];
+nsc:
+
+ memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
*dst_length= di;
*consumed= si + 1;//+1 for the header
@@ -1587,85 +1606,6 @@ static inline int get_chroma_qp(H264Context *h, int t, int qscale){
return h->pps.chroma_qp_table[t][qscale];
}
-//FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
-//FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
-static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
- int i;
- const int * const quant_table= quant_coeff[qscale];
- const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
- const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
- const unsigned int threshold2= (threshold1<<1);
- int last_non_zero;
-
- if(separate_dc){
- if(qscale<=18){
- //avoid overflows
- const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
- const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
- const unsigned int dc_threshold2= (dc_threshold1<<1);
-
- int level= block[0]*quant_coeff[qscale+18][0];
- if(((unsigned)(level+dc_threshold1))>dc_threshold2){
- if(level>0){
- level= (dc_bias + level)>>(QUANT_SHIFT-2);
- block[0]= level;
- }else{
- level= (dc_bias - level)>>(QUANT_SHIFT-2);
- block[0]= -level;
- }
-// last_non_zero = i;
- }else{
- block[0]=0;
- }
- }else{
- const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
- const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
- const unsigned int dc_threshold2= (dc_threshold1<<1);
-
- int level= block[0]*quant_table[0];
- if(((unsigned)(level+dc_threshold1))>dc_threshold2){
- if(level>0){
- level= (dc_bias + level)>>(QUANT_SHIFT+1);
- block[0]= level;
- }else{
- level= (dc_bias - level)>>(QUANT_SHIFT+1);
- block[0]= -level;
- }
-// last_non_zero = i;
- }else{
- block[0]=0;
- }
- }
- last_non_zero= 0;
- i=1;
- }else{
- last_non_zero= -1;
- i=0;
- }
-
- for(; i<16; i++){
- const int j= scantable[i];
- int level= block[j]*quant_table[j];
-
-// if( bias+level >= (1<<(QMAT_SHIFT - 3))
-// || bias-level >= (1<<(QMAT_SHIFT - 3))){
- if(((unsigned)(level+threshold1))>threshold2){
- if(level>0){
- level= (bias + level)>>QUANT_SHIFT;
- block[j]= level;
- }else{
- level= (bias - level)>>QUANT_SHIFT;
- block[j]= -level;
- }
- last_non_zero = i;
- }else{
- block[j]=0;
- }
- }
-
- return last_non_zero;
-}
-
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
int src_x_offset, int src_y_offset,
@@ -1684,9 +1624,6 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
const int pic_width = 16*s->mb_width;
const int pic_height = 16*s->mb_height >> MB_FIELD;
- if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
- return;
-
if(mx&7) extra_width -= 3;
if(my&7) extra_height -= 3;
@@ -1949,6 +1886,31 @@ static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t
prefetch_motion(h, 1);
}
+static av_cold void init_cavlc_level_tab(void){
+ int suffix_length, mask;
+ unsigned int i;
+
+ for(suffix_length=0; suffix_length<7; suffix_length++){
+ for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
+ int prefix= LEVEL_TAB_BITS - av_log2(2*i);
+ int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
+
+ mask= -(level_code&1);
+ level_code= (((2+level_code)>>1) ^ mask) - mask;
+ if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
+ cavlc_level_tab[suffix_length][i][0]= level_code;
+ cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
+ }else if(prefix + 1 <= LEVEL_TAB_BITS){
+ cavlc_level_tab[suffix_length][i][0]= prefix+100;
+ cavlc_level_tab[suffix_length][i][1]= prefix + 1;
+ }else{
+ cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
+ cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
+ }
+ }
+ }
+}
+
static av_cold void decode_init_vlc(void){
static int done = 0;
@@ -2015,6 +1977,8 @@ static av_cold void decode_init_vlc(void){
&run_len [6][0], 1, 1,
&run_bits[6][0], 1, 1,
INIT_VLC_USE_NEW_STATIC);
+
+ init_cavlc_level_tab();
}
}
@@ -2200,6 +2164,8 @@ static av_cold void common_init(H264Context *h){
s->unrestricted_mv=1;
s->decode=1; //FIXME
+ dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
+
memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
}
@@ -2223,6 +2189,8 @@ static av_cold int decode_init(AVCodecContext *avctx){
if(avctx->codec_id == CODEC_ID_SVQ3)
avctx->pix_fmt= PIX_FMT_YUVJ420P;
+ else if(avctx->codec_id == CODEC_ID_H264_VDPAU)
+ avctx->pix_fmt= PIX_FMT_VDPAU_H264;
else
avctx->pix_fmt= PIX_FMT_YUV420P;
@@ -2445,13 +2413,15 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
int linesize, uvlinesize /*dct_offset*/;
int i;
int *block_offset = &h->block_offset[0];
- const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
+ const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
+ /* is_h264 should always be true if SVQ3 is disabled. */
+ const int is_h264 = !ENABLE_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
- dest_y = s->current_picture.data[0] + (mb_y * 16* s->linesize ) + mb_x * 16;
- dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
- dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
+ dest_y = s->current_picture.data[0] + (mb_x + mb_y * s->linesize ) * 16;
+ dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
+ dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
@@ -2488,17 +2458,6 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
// dct_offset = s->linesize * 16;
}
- if(transform_bypass){
- idct_dc_add =
- idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
- }else if(IS_8x8DCT(mb_type)){
- idct_dc_add = s->dsp.h264_idct8_dc_add;
- idct_add = s->dsp.h264_idct8_add;
- }else{
- idct_dc_add = s->dsp.h264_idct_dc_add;
- idct_add = s->dsp.h264_idct_add;
- }
-
if (!simple && IS_INTRA_PCM(mb_type)) {
for (i=0; i<16; i++) {
memcpy(dest_y + i* linesize, h->mb + i*8, 16);
@@ -2520,47 +2479,70 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
if(IS_INTRA4x4(mb_type)){
if(simple || !s->encoding){
if(IS_8x8DCT(mb_type)){
+ if(transform_bypass){
+ idct_dc_add =
+ idct_add = s->dsp.add_pixels8;
+ }else{
+ idct_dc_add = s->dsp.h264_idct8_dc_add;
+ idct_add = s->dsp.h264_idct8_add;
+ }
for(i=0; i<16; i+=4){
uint8_t * const ptr= dest_y + block_offset[i];
const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
- const int nnz = h->non_zero_count_cache[ scan8[i] ];
- h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
- (h->topright_samples_available<<i)&0x4000, linesize);
- if(nnz){
- if(nnz == 1 && h->mb[i*16])
- idct_dc_add(ptr, h->mb + i*16, linesize);
- else
- idct_add(ptr, h->mb + i*16, linesize);
+ if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
+ h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
+ }else{
+ const int nnz = h->non_zero_count_cache[ scan8[i] ];
+ h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
+ (h->topright_samples_available<<i)&0x4000, linesize);
+ if(nnz){
+ if(nnz == 1 && h->mb[i*16])
+ idct_dc_add(ptr, h->mb + i*16, linesize);
+ else
+ idct_add (ptr, h->mb + i*16, linesize);
+ }
}
}
- }else
- for(i=0; i<16; i++){
- uint8_t * const ptr= dest_y + block_offset[i];
- uint8_t *topright;
- const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
- int nnz, tr;
-
- if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
- const int topright_avail= (h->topright_samples_available<<i)&0x8000;
- assert(mb_y || linesize <= block_offset[i]);
- if(!topright_avail){
- tr= ptr[3 - linesize]*0x01010101;
- topright= (uint8_t*) &tr;
- }else
- topright= ptr + 4 - linesize;
- }else
- topright= NULL;
-
- h->hpc.pred4x4[ dir ](ptr, topright, linesize);
- nnz = h->non_zero_count_cache[ scan8[i] ];
- if(nnz){
- if(is_h264){
- if(nnz == 1 && h->mb[i*16])
- idct_dc_add(ptr, h->mb + i*16, linesize);
- else
- idct_add(ptr, h->mb + i*16, linesize);
- }else
- svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
+ }else{
+ if(transform_bypass){
+ idct_dc_add =
+ idct_add = s->dsp.add_pixels4;
+ }else{
+ idct_dc_add = s->dsp.h264_idct_dc_add;
+ idct_add = s->dsp.h264_idct_add;
+ }
+ for(i=0; i<16; i++){
+ uint8_t * const ptr= dest_y + block_offset[i];
+ const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
+
+ if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
+ h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
+ }else{
+ uint8_t *topright;
+ int nnz, tr;
+ if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
+ const int topright_avail= (h->topright_samples_available<<i)&0x8000;
+ assert(mb_y || linesize <= block_offset[i]);
+ if(!topright_avail){
+ tr= ptr[3 - linesize]*0x01010101;
+ topright= (uint8_t*) &tr;
+ }else
+ topright= ptr + 4 - linesize;
+ }else
+ topright= NULL;
+
+ h->hpc.pred4x4[ dir ](ptr, topright, linesize);
+ nnz = h->non_zero_count_cache[ scan8[i] ];
+ if(nnz){
+ if(is_h264){
+ if(nnz == 1 && h->mb[i*16])
+ idct_dc_add(ptr, h->mb + i*16, linesize);
+ else
+ idct_add (ptr, h->mb + i*16, linesize);
+ }else
+ svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
+ }
+ }
}
}
}
@@ -2585,21 +2567,32 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
if(!IS_INTRA4x4(mb_type)){
if(is_h264){
if(IS_INTRA16x16(mb_type)){
- for(i=0; i<16; i++){
- if(h->non_zero_count_cache[ scan8[i] ])
- idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
- else if(h->mb[i*16])
- idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
+ if(transform_bypass){
+ if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
+ h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
+ }else{
+ for(i=0; i<16; i++){
+ if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
+ s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
+ }
+ }
+ }else{
+ s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
}
- }else{
- const int di = IS_8x8DCT(mb_type) ? 4 : 1;
- for(i=0; i<16; i+=di){
- int nnz = h->non_zero_count_cache[ scan8[i] ];
- if(nnz){
- if(nnz==1 && h->mb[i*16])
- idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
- else
+ }else if(h->cbp&15){
+ if(transform_bypass){
+ const int di = IS_8x8DCT(mb_type) ? 4 : 1;
+ idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
+ for(i=0; i<16; i+=di){
+ if(h->non_zero_count_cache[ scan8[i] ]){
idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
+ }
+ }
+ }else{
+ if(IS_8x8DCT(mb_type)){
+ s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
+ }else{
+ s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
}
}
}
@@ -2613,33 +2606,45 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
}
}
- if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
+ if((simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
uint8_t *dest[2] = {dest_cb, dest_cr};
if(transform_bypass){
- idct_add = idct_dc_add = s->dsp.add_pixels4;
+ if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
+ h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
+ h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
+ }else{
+ idct_add = s->dsp.add_pixels4;
+ for(i=16; i<16+8; i++){
+ if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
+ idct_add (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
+ }
+ }
}else{
- idct_add = s->dsp.h264_idct_add;
- idct_dc_add = s->dsp.h264_idct_dc_add;
chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
- }
- if(is_h264){
- for(i=16; i<16+8; i++){
- if(h->non_zero_count_cache[ scan8[i] ])
- idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
- else if(h->mb[i*16])
- idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
- }
- }else{
- for(i=16; i<16+8; i++){
- if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
- uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
- svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
+ if(is_h264){
+ idct_add = s->dsp.h264_idct_add;
+ idct_dc_add = s->dsp.h264_idct_dc_add;
+ for(i=16; i<16+8; i++){
+ if(h->non_zero_count_cache[ scan8[i] ])
+ idct_add (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
+ else if(h->mb[i*16])
+ idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
+ }
+ }else{
+ for(i=16; i<16+8; i++){
+ if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
+ uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
+ svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
+ }
}
}
}
}
}
+ if(h->cbp || IS_INTRA(mb_type))
+ s->dsp.clear_blocks(h->mb);
+
if(h->deblocking_filter) {
backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
@@ -2671,8 +2676,7 @@ static void hl_decode_mb(H264Context *h){
MpegEncContext * const s = &h->s;
const int mb_xy= h->mb_xy;
const int mb_type= s->current_picture.mb_type[mb_xy];
- int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 ||
- (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding) || ENABLE_SMALL;
+ int is_complex = ENABLE_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
if(ENABLE_H264_ENCODER && !s->decode)
return;
@@ -2848,7 +2852,7 @@ static int decode_ref_pic_list_reordering(H264Context *h){
int pred= h->curr_pic_num;
for(index=0; ; index++){
- unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
+ unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
unsigned int pic_id;
int i;
Picture *ref = NULL;
@@ -3414,7 +3418,7 @@ static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
}else{
if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
for(i= 0; i<MAX_MMCO_COUNT; i++) {
- MMCOOpcode opcode= get_ue_golomb(gb);
+ MMCOOpcode opcode= get_ue_golomb_31(gb);
h->mmco[i].opcode= opcode;
if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
@@ -3425,7 +3429,7 @@ static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
}*/
}
if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
- unsigned int long_arg= get_ue_golomb(gb);
+ unsigned int long_arg= get_ue_golomb_31(gb);
if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
return -1;
@@ -3631,7 +3635,6 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
unsigned int first_mb_in_slice;
unsigned int pps_id;
int num_ref_idx_active_override_flag;
- static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
unsigned int slice_type, tmp, i, j;
int default_ref_list_done = 0;
int last_pic_structure;
@@ -3654,7 +3657,7 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
s->current_picture_ptr= NULL;
}
- slice_type= get_ue_golomb(&s->gb);
+ slice_type= get_ue_golomb_31(&s->gb);
if(slice_type > 9){
av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
return -1;
@@ -3665,7 +3668,7 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
}else
h->slice_type_fixed=0;
- slice_type= slice_type_map[ slice_type ];
+ slice_type= golomb_to_pict_type[ slice_type ];
if (slice_type == FF_I_TYPE
|| (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
default_ref_list_done = 1;
@@ -3719,6 +3722,7 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
if(h != h0)
return -1; // width / height changed during parallelized decoding
free_tables(h);
+ flush_dpb(s->avctx);
MPV_common_end(s);
}
if (!s->context_initialized) {
@@ -3947,7 +3951,7 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
direct_ref_list_init(h);
if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
- tmp = get_ue_golomb(&s->gb);
+ tmp = get_ue_golomb_31(&s->gb);
if(tmp > 2){
av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
return -1;
@@ -3976,7 +3980,7 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
h->slice_alpha_c0_offset = 0;
h->slice_beta_offset = 0;
if( h->pps.deblocking_filter_parameters_present ) {
- tmp= get_ue_golomb(&s->gb);
+ tmp= get_ue_golomb_31(&s->gb);
if(tmp > 2){
av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
return -1;
@@ -4041,6 +4045,8 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
+ s->avctx->refs= h->sps.ref_frame_count;
+
if(s->avctx->debug&FF_DEBUG_PICT_INFO){
av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
h->slice_num,
@@ -4085,13 +4091,10 @@ static inline int get_level_prefix(GetBitContext *gb){
}
static inline int get_dct8x8_allowed(H264Context *h){
- int i;
- for(i=0; i<4; i++){
- if(!IS_SUB_8X8(h->sub_mb_type[i])
- || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
- return 0;
- }
- return 1;
+ if(h->sps.direct_8x8_inference_flag)
+ return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8 )*0x0001000100010001ULL));
+ else
+ return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
}
/**
@@ -4138,56 +4141,81 @@ static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, in
tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
assert(total_coeff<=16);
- for(i=0; i<trailing_ones; i++){
- level[i]= 1 - 2*get_bits1(gb);
- }
+ i = show_bits(gb, 3);
+ skip_bits(gb, trailing_ones);
+ level[0] = 1-((i&4)>>1);
+ level[1] = 1-((i&2) );
+ level[2] = 1-((i&1)<<1);
- if(i<total_coeff) {
- int level_code, mask;
+ if(trailing_ones<total_coeff) {
+ int mask, prefix;
int suffix_length = total_coeff > 10 && trailing_ones < 3;
- int prefix= get_level_prefix(gb);
+ int bitsi= show_bits(gb, LEVEL_TAB_BITS);
+ int level_code= cavlc_level_tab[suffix_length][bitsi][0];
+
+ skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
+ if(level_code >= 100){
+ prefix= level_code - 100;
+ if(prefix == LEVEL_TAB_BITS)
+ prefix += get_level_prefix(gb);
+
+ //first coefficient has suffix_length equal to 0 or 1
+ if(prefix<14){ //FIXME try to build a large unified VLC table for all this
+ if(suffix_length)
+ level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
+ else
+ level_code= (prefix<<suffix_length); //part
+ }else if(prefix==14){
+ if(suffix_length)
+ level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
+ else
+ level_code= prefix + get_bits(gb, 4); //part
+ }else{
+ level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
+ if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
+ if(prefix>=16)
+ level_code += (1<<(prefix-3))-4096;
+ }
- //first coefficient has suffix_length equal to 0 or 1
- if(prefix<14){ //FIXME try to build a large unified VLC table for all this
- if(suffix_length)
- level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
- else
- level_code= (prefix<<suffix_length); //part
- }else if(prefix==14){
- if(suffix_length)
- level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
- else
- level_code= prefix + get_bits(gb, 4); //part
- }else{
- level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
- if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
- if(prefix>=16)
- level_code += (1<<(prefix-3))-4096;
- }
+ if(trailing_ones < 3) level_code += 2;
- if(trailing_ones < 3) level_code += 2;
+ suffix_length = 2;
+ mask= -(level_code&1);
+ level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
+ }else{
+ if(trailing_ones < 3) level_code += (level_code>>31)|1;
- suffix_length = 1;
- if(level_code > 5)
- suffix_length++;
- mask= -(level_code&1);
- level[i]= (((2+level_code)>>1) ^ mask) - mask;
- i++;
+ suffix_length = 1;
+ if(level_code + 3U > 6U)
+ suffix_length++;
+ level[trailing_ones]= level_code;
+ }
//remaining coefficients have suffix_length > 0
- for(;i<total_coeff;i++) {
- static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
- prefix = get_level_prefix(gb);
- if(prefix<15){
- level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
- }else{
- level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
- if(prefix>=16)
- level_code += (1<<(prefix-3))-4096;
+ for(i=trailing_ones+1;i<total_coeff;i++) {
+ static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
+ int bitsi= show_bits(gb, LEVEL_TAB_BITS);
+ level_code= cavlc_level_tab[suffix_length][bitsi][0];
+
+ skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
+ if(level_code >= 100){
+ prefix= level_code - 100;
+ if(prefix == LEVEL_TAB_BITS){
+ prefix += get_level_prefix(gb);
+ }
+ if(prefix<15){
+ level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
+ }else{
+ level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
+ if(prefix>=16)
+ level_code += (1<<(prefix-3))-4096;
+ }
+ mask= -(level_code&1);
+ level_code= (((2+level_code)>>1) ^ mask) - mask;
}
- mask= -(level_code&1);
- level[i]= (((2+level_code)>>1) ^ mask) - mask;
- if(level_code > suffix_limit[suffix_length])
+ level[i]= level_code;
+
+ if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
suffix_length++;
}
}
@@ -4310,8 +4338,6 @@ static int decode_mb_cavlc(H264Context *h){
mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
- s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
-
tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
cbp = 0; /* avoid warning. FIXME: find a solution without slowing
down the code */
@@ -4436,7 +4462,7 @@ decode_intra_mb:
return -1;
}
if(CHROMA){
- pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
+ pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
if(pred_mode < 0)
return -1;
h->chroma_pred_mode= pred_mode;
@@ -4446,7 +4472,7 @@ decode_intra_mb:
if(h->slice_type_nos == FF_B_TYPE){
for(i=0; i<4; i++){
- h->sub_mb_type[i]= get_ue_golomb(&s->gb);
+ h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
if(h->sub_mb_type[i] >=13){
av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
return -1;
@@ -4465,7 +4491,7 @@ decode_intra_mb:
}else{
assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
for(i=0; i<4; i++){
- h->sub_mb_type[i]= get_ue_golomb(&s->gb);
+ h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
if(h->sub_mb_type[i] >=4){
av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
return -1;
@@ -4480,10 +4506,17 @@ decode_intra_mb:
for(i=0; i<4; i++){
if(IS_DIRECT(h->sub_mb_type[i])) continue;
if(IS_DIR(h->sub_mb_type[i], 0, list)){
- unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
- if(tmp>=ref_count){
- av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
- return -1;
+ unsigned int tmp;
+ if(ref_count == 1){
+ tmp= 0;
+ }else if(ref_count == 2){
+ tmp= get_bits1(&s->gb)^1;
+ }else{
+ tmp= get_ue_golomb_31(&s->gb);
+ if(tmp>=ref_count){
+ av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
+ return -1;
+ }
}
ref[list][i]= tmp;
}else{
@@ -4549,10 +4582,16 @@ decode_intra_mb:
for(list=0; list<h->list_count; list++){
unsigned int val;
if(IS_DIR(mb_type, 0, list)){
- val= get_te0_golomb(&s->gb, h->ref_count[list]);
- if(val >= h->ref_count[list]){
- av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
- return -1;
+ if(h->ref_count[list]==1){
+ val= 0;
+ }else if(h->ref_count[list]==2){
+ val= get_bits1(&s->gb)^1;
+ }else{
+ val= get_ue_golomb_31(&s->gb);
+ if(val >= h->ref_count[list]){
+ av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
+ return -1;
+ }
}
}else
val= LIST_NOT_USED&0xFF;
@@ -4577,10 +4616,16 @@ decode_intra_mb:
for(i=0; i<2; i++){
unsigned int val;
if(IS_DIR(mb_type, i, list)){
- val= get_te0_golomb(&s->gb, h->ref_count[list]);
- if(val >= h->ref_count[list]){
- av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
- return -1;
+ if(h->ref_count[list] == 1){
+ val= 0;
+ }else if(h->ref_count[list] == 2){
+ val= get_bits1(&s->gb)^1;
+ }else{
+ val= get_ue_golomb_31(&s->gb);
+ if(val >= h->ref_count[list]){
+ av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
+ return -1;
+ }
}
}else
val= LIST_NOT_USED&0xFF;
@@ -4608,10 +4653,16 @@ decode_intra_mb:
for(i=0; i<2; i++){
unsigned int val;
if(IS_DIR(mb_type, i, list)){ //FIXME optimize
- val= get_te0_golomb(&s->gb, h->ref_count[list]);
- if(val >= h->ref_count[list]){
- av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
- return -1;
+ if(h->ref_count[list]==1){
+ val= 0;
+ }else if(h->ref_count[list]==2){
+ val= get_bits1(&s->gb)^1;
+ }else{
+ val= get_ue_golomb_31(&s->gb);
+ if(val >= h->ref_count[list]){
+ av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
+ return -1;
+ }
}
}else
val= LIST_NOT_USED&0xFF;
@@ -4836,29 +4887,14 @@ static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_sl
return mb_type;
}
-static int decode_cabac_mb_type( H264Context *h ) {
+static int decode_cabac_mb_type_b( H264Context *h ) {
MpegEncContext * const s = &h->s;
- if( h->slice_type_nos == FF_I_TYPE ) {
- return decode_cabac_intra_mb_type(h, 3, 1);
- } else if( h->slice_type_nos == FF_P_TYPE ) {
- if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
- /* P-type */
- if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
- /* P_L0_D16x16, P_8x8 */
- return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
- } else {
- /* P_L0_D8x16, P_L0_D16x8 */
- return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
- }
- } else {
- return decode_cabac_intra_mb_type(h, 17, 0) + 5;
- }
- } else if( h->slice_type_nos == FF_B_TYPE ) {
const int mba_xy = h->left_mb_xy[0];
const int mbb_xy = h->top_mb_xy;
int ctx = 0;
int bits;
+ assert(h->slice_type_nos == FF_B_TYPE);
if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
ctx++;
@@ -4887,10 +4923,6 @@ static int decode_cabac_mb_type( H264Context *h ) {
bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
- } else {
- /* TODO SI/SP frames? */
- return -1;
- }
}
static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
@@ -5004,26 +5036,20 @@ static int decode_cabac_mb_cbp_chroma( H264Context *h) {
return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
}
static int decode_cabac_mb_dqp( H264Context *h) {
- int ctx = 0;
+ int ctx= h->last_qscale_diff != 0;
int val = 0;
- if( h->last_qscale_diff != 0 )
- ctx++;
-
while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
- if( ctx < 2 )
- ctx = 2;
- else
- ctx = 3;
+ ctx= 2+(ctx>>1);
val++;
if(val > 102) //prevent infinite loop
return INT_MIN;
}
if( val&0x01 )
- return (val + 1)/2;
+ return (val + 1)>>1 ;
else
- return -(val + 1)/2;
+ return -((val + 1)>>1);
}
static int decode_cabac_p_mb_sub_type( H264Context *h ) {
if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
@@ -5075,13 +5101,9 @@ static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
ref++;
- if( ctx < 4 )
- ctx = 4;
- else
- ctx = 5;
+ ctx = (ctx>>2)+4;
if(ref >= 32 /*h->ref_list[list]*/){
- av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
- return 0; //FIXME we should return -1 and check the return everywhere
+ return -1;
}
}
return ref;
@@ -5091,14 +5113,8 @@ static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
abs( h->mvd_cache[list][scan8[n] - 8][l] );
int ctxbase = (l == 0) ? 40 : 47;
- int ctx, mvd;
-
- if( amvd < 3 )
- ctx = 0;
- else if( amvd > 32 )
- ctx = 2;
- else
- ctx = 1;
+ int mvd;
+ int ctx = (amvd>2) + (amvd>32);
if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
return 0;
@@ -5142,14 +5158,9 @@ static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx,
nzb = (h-> top_cbp>>(6+idx))&0x01;
}
} else {
- if( cat == 4 ) {
- nza = h->non_zero_count_cache[scan8[16+idx] - 1];
- nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
- } else {
- assert(cat == 1 || cat == 2);
- nza = h->non_zero_count_cache[scan8[idx] - 1];
- nzb = h->non_zero_count_cache[scan8[idx] - 8];
- }
+ assert(cat == 1 || cat == 2 || cat == 4);
+ nza = h->non_zero_count_cache[scan8[idx] - 1];
+ nzb = h->non_zero_count_cache[scan8[idx] - 8];
}
if( nza > 0 )
@@ -5231,19 +5242,15 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
* 1-> AC 16x16 n = luma4x4idx
* 2-> Luma4x4 n = luma4x4idx
* 3-> DC Chroma n = iCbCr
- * 4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
+ * 4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
* 5-> Luma8x8 n = 4 * luma8x8idx
*/
/* read coded block flag */
if( is_dc || cat != 5 ) {
if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
- if( !is_dc ) {
- if( cat == 4 )
- h->non_zero_count_cache[scan8[16+n]] = 0;
- else
- h->non_zero_count_cache[scan8[n]] = 0;
- }
+ if( !is_dc )
+ h->non_zero_count_cache[scan8[n]] = 0;
#ifdef CABAC_ON_STACK
h->cabac.range = cc.range ;
@@ -5298,10 +5305,8 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
} else {
if( cat == 5 )
fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
- else if( cat == 4 )
- h->non_zero_count_cache[scan8[16+n]] = coeff_count;
else {
- assert( cat == 1 || cat == 2 );
+ assert( cat == 1 || cat == 2 || cat == 4 );
h->non_zero_count_cache[scan8[n]] = coeff_count;
}
}
@@ -5383,17 +5388,15 @@ static inline void compute_mb_neighbors(H264Context *h)
if(FRAME_MBAFF){
const int pair_xy = s->mb_x + (s->mb_y & ~1)*s->mb_stride;
const int top_pair_xy = pair_xy - s->mb_stride;
- const int top_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
- const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
- const int curr_mb_frame_flag = !MB_FIELD;
+ const int top_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
+ const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
+ const int curr_mb_field_flag = MB_FIELD;
const int bottom = (s->mb_y & 1);
- if (bottom
- ? !curr_mb_frame_flag // bottom macroblock
- : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
- ) {
+
+ if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
h->top_mb_xy -= s->mb_stride;
}
- if (left_mb_frame_flag != curr_mb_frame_flag) {
+ if (!left_mb_field_flag == curr_mb_field_flag) {
h->left_mb_xy[0] = pair_xy - 1;
}
} else if (FIELD_PICTURE) {
@@ -5414,8 +5417,6 @@ static int decode_mb_cabac(H264Context *h) {
mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
- s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
-
tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
if( h->slice_type_nos != FF_I_TYPE ) {
int skip;
@@ -5431,9 +5432,7 @@ static int decode_mb_cabac(H264Context *h) {
if( FRAME_MBAFF && (s->mb_y&1)==0 ){
s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
- if(h->next_mb_skipped)
- predict_field_decoding_flag(h);
- else
+ if(!h->next_mb_skipped)
h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
}
@@ -5456,12 +5455,9 @@ static int decode_mb_cabac(H264Context *h) {
h->prev_mb_skipped = 0;
compute_mb_neighbors(h);
- if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
- av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
- return -1;
- }
if( h->slice_type_nos == FF_B_TYPE ) {
+ mb_type = decode_cabac_mb_type_b( h );
if( mb_type < 23 ){
partition_count= b_mb_type_info[mb_type].partition_count;
mb_type= b_mb_type_info[mb_type].type;
@@ -5470,14 +5466,23 @@ static int decode_mb_cabac(H264Context *h) {
goto decode_intra_mb;
}
} else if( h->slice_type_nos == FF_P_TYPE ) {
- if( mb_type < 5) {
+ if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
+ /* P-type */
+ if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
+ /* P_L0_D16x16, P_8x8 */
+ mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
+ } else {
+ /* P_L0_D8x16, P_L0_D16x8 */
+ mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
+ }
partition_count= p_mb_type_info[mb_type].partition_count;
mb_type= p_mb_type_info[mb_type].type;
} else {
- mb_type -= 5;
+ mb_type= decode_cabac_intra_mb_type(h, 17, 0);
goto decode_intra_mb;
}
} else {
+ mb_type= decode_cabac_intra_mb_type(h, 3, 1);
if(h->slice_type == FF_SI_TYPE && mb_type)
mb_type--;
assert(h->slice_type_nos == FF_I_TYPE);
@@ -5597,9 +5602,13 @@ decode_intra_mb:
for( i = 0; i < 4; i++ ) {
if(IS_DIRECT(h->sub_mb_type[i])) continue;
if(IS_DIR(h->sub_mb_type[i], 0, list)){
- if( h->ref_count[list] > 1 )
+ if( h->ref_count[list] > 1 ){
ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
- else
+ if(ref[list][i] >= (unsigned)h->ref_count[list]){
+ av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
+ return -1;
+ }
+ }else
ref[list][i] = 0;
} else {
ref[list][i] = -1;
@@ -5682,7 +5691,15 @@ decode_intra_mb:
if(IS_16X16(mb_type)){
for(list=0; list<h->list_count; list++){
if(IS_DIR(mb_type, 0, list)){
- const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
+ int ref;
+ if(h->ref_count[list] > 1){
+ ref= decode_cabac_mb_ref(h, list, 0);
+ if(ref >= (unsigned)h->ref_count[list]){
+ av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
+ return -1;
+ }
+ }else
+ ref=0;
fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
}else
fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
@@ -5705,7 +5722,15 @@ decode_intra_mb:
for(list=0; list<h->list_count; list++){
for(i=0; i<2; i++){
if(IS_DIR(mb_type, i, list)){
- const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
+ int ref;
+ if(h->ref_count[list] > 1){
+ ref= decode_cabac_mb_ref( h, list, 8*i );
+ if(ref >= (unsigned)h->ref_count[list]){
+ av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
+ return -1;
+ }
+ }else
+ ref=0;
fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
}else
fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
@@ -5732,7 +5757,15 @@ decode_intra_mb:
for(list=0; list<h->list_count; list++){
for(i=0; i<2; i++){
if(IS_DIR(mb_type, i, list)){ //FIXME optimize
- const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
+ int ref;
+ if(h->ref_count[list] > 1){
+ ref= decode_cabac_mb_ref( h, list, 4*i );
+ if(ref >= (unsigned)h->ref_count[list]){
+ av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
+ return -1;
+ }
+ }else
+ ref=0;
fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
}else
fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
@@ -5857,7 +5890,7 @@ decode_intra_mb:
for( i = 0; i < 4; i++ ) {
const int index = 16 + 4 * c + i;
//av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
- decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
+ decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
}
}
} else {
@@ -5886,76 +5919,32 @@ decode_intra_mb:
static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
- int i, d;
const int index_a = qp + h->slice_alpha_c0_offset;
const int alpha = (alpha_table+52)[index_a];
const int beta = (beta_table+52)[qp + h->slice_beta_offset];
if( bS[0] < 4 ) {
int8_t tc[4];
- for(i=0; i<4; i++)
- tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
+ tc[0] = (tc0_table+52)[index_a][bS[0]];
+ tc[1] = (tc0_table+52)[index_a][bS[1]];
+ tc[2] = (tc0_table+52)[index_a][bS[2]];
+ tc[3] = (tc0_table+52)[index_a][bS[3]];
h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
} else {
- /* 16px edge length, because bS=4 is triggered by being at
- * the edge of an intra MB, so all 4 bS are the same */
- for( d = 0; d < 16; d++ ) {
- const int p0 = pix[-1];
- const int p1 = pix[-2];
- const int p2 = pix[-3];
-
- const int q0 = pix[0];
- const int q1 = pix[1];
- const int q2 = pix[2];
-
- if( FFABS( p0 - q0 ) < alpha &&
- FFABS( p1 - p0 ) < beta &&
- FFABS( q1 - q0 ) < beta ) {
-
- if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
- if( FFABS( p2 - p0 ) < beta)
- {
- const int p3 = pix[-4];
- /* p0', p1', p2' */
- pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
- pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
- pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
- } else {
- /* p0' */
- pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
- }
- if( FFABS( q2 - q0 ) < beta)
- {
- const int q3 = pix[3];
- /* q0', q1', q2' */
- pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
- pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
- pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
- } else {
- /* q0' */
- pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
- }
- }else{
- /* p0', q0' */
- pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
- pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
- }
- tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
- }
- pix += stride;
- }
+ h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
}
}
static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
- int i;
const int index_a = qp + h->slice_alpha_c0_offset;
const int alpha = (alpha_table+52)[index_a];
const int beta = (beta_table+52)[qp + h->slice_beta_offset];
if( bS[0] < 4 ) {
int8_t tc[4];
- for(i=0; i<4; i++)
- tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
+ tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
+ tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
+ tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
+ tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
} else {
h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
@@ -5986,7 +5975,7 @@ static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int
beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
if( bS[bS_index] < 4 ) {
- const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
+ const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
const int p0 = pix[-1];
const int p1 = pix[-2];
const int p2 = pix[-3];
@@ -6080,7 +6069,7 @@ static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, in
beta = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
if( bS[bS_index] < 4 ) {
- const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
+ const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
const int p0 = pix[-1];
const int p1 = pix[-2];
const int q0 = pix[0];
@@ -6114,75 +6103,33 @@ static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, in
}
static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
- int i, d;
const int index_a = qp + h->slice_alpha_c0_offset;
const int alpha = (alpha_table+52)[index_a];
const int beta = (beta_table+52)[qp + h->slice_beta_offset];
- const int pix_next = stride;
if( bS[0] < 4 ) {
int8_t tc[4];
- for(i=0; i<4; i++)
- tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
+ tc[0] = (tc0_table+52)[index_a][bS[0]];
+ tc[1] = (tc0_table+52)[index_a][bS[1]];
+ tc[2] = (tc0_table+52)[index_a][bS[2]];
+ tc[3] = (tc0_table+52)[index_a][bS[3]];
h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
} else {
- /* 16px edge length, see filter_mb_edgev */
- for( d = 0; d < 16; d++ ) {
- const int p0 = pix[-1*pix_next];
- const int p1 = pix[-2*pix_next];
- const int p2 = pix[-3*pix_next];
- const int q0 = pix[0];
- const int q1 = pix[1*pix_next];
- const int q2 = pix[2*pix_next];
-
- if( FFABS( p0 - q0 ) < alpha &&
- FFABS( p1 - p0 ) < beta &&
- FFABS( q1 - q0 ) < beta ) {
-
- const int p3 = pix[-4*pix_next];
- const int q3 = pix[ 3*pix_next];
-
- if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
- if( FFABS( p2 - p0 ) < beta) {
- /* p0', p1', p2' */
- pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
- pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
- pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
- } else {
- /* p0' */
- pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
- }
- if( FFABS( q2 - q0 ) < beta) {
- /* q0', q1', q2' */
- pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
- pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
- pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
- } else {
- /* q0' */
- pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
- }
- }else{
- /* p0', q0' */
- pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
- pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
- }
- tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
- }
- pix++;
- }
+ h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
}
}
static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
- int i;
const int index_a = qp + h->slice_alpha_c0_offset;
const int alpha = (alpha_table+52)[index_a];
const int beta = (beta_table+52)[qp + h->slice_beta_offset];
if( bS[0] < 4 ) {
int8_t tc[4];
- for(i=0; i<4; i++)
- tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
+ tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
+ tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
+ tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
+ tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
} else {
h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
@@ -6198,7 +6145,7 @@ static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y,
mb_xy = h->mb_xy;
if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
-1 ||
+ !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
(h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
@@ -6303,6 +6250,210 @@ static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y,
}
}
+
+static void av_always_inline filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
+ MpegEncContext * const s = &h->s;
+ int edge;
+ const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
+ const int mbm_type = s->current_picture.mb_type[mbm_xy];
+ int (*ref2frm) [64] = h->ref2frm[ h->slice_num &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
+ int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
+ int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
+
+ const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
+ == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
+ // how often to recheck mv-based bS when iterating between edges
+ const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
+ (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
+ // how often to recheck mv-based bS when iterating along each edge
+ const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
+
+ if (first_vertical_edge_done) {
+ start = 1;
+ }
+
+ if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
+ start = 1;
+
+ if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
+ && !IS_INTERLACED(mb_type)
+ && IS_INTERLACED(mbm_type)
+ ) {
+ // This is a special case in the norm where the filtering must
+ // be done twice (one each of the field) even if we are in a
+ // frame macroblock.
+ //
+ static const int nnz_idx[4] = {4,5,6,3};
+ unsigned int tmp_linesize = 2 * linesize;
+ unsigned int tmp_uvlinesize = 2 * uvlinesize;
+ int mbn_xy = mb_xy - 2 * s->mb_stride;
+ int qp;
+ int i, j;
+ int16_t bS[4];
+
+ for(j=0; j<2; j++, mbn_xy += s->mb_stride){
+ if( IS_INTRA(mb_type) ||
+ IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
+ bS[0] = bS[1] = bS[2] = bS[3] = 3;
+ } else {
+ const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
+ for( i = 0; i < 4; i++ ) {
+ if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
+ mbn_nnz[nnz_idx[i]] != 0 )
+ bS[i] = 2;
+ else
+ bS[i] = 1;
+ }
+ }
+ // Do not use s->qscale as luma quantizer because it has not the same
+ // value in IPCM macroblocks.
+ qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
+ tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
+ { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
+ filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
+ filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
+ ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
+ filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
+ ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
+ }
+
+ start = 1;
+ }
+
+ /* Calculate bS */
+ for( edge = start; edge < edges; edge++ ) {
+ /* mbn_xy: neighbor macroblock */
+ const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
+ const int mbn_type = s->current_picture.mb_type[mbn_xy];
+ int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
+ int16_t bS[4];
+ int qp;
+
+ if( (edge&1) && IS_8x8DCT(mb_type) )
+ continue;
+
+ if( IS_INTRA(mb_type) ||
+ IS_INTRA(mbn_type) ) {
+ int value;
+ if (edge == 0) {
+ if ( (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
+ || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
+ ) {
+ value = 4;
+ } else {
+ value = 3;
+ }
+ } else {
+ value = 3;
+ }
+ bS[0] = bS[1] = bS[2] = bS[3] = value;
+ } else {
+ int i, l;
+ int mv_done;
+
+ if( edge & mask_edge ) {
+ bS[0] = bS[1] = bS[2] = bS[3] = 0;
+ mv_done = 1;
+ }
+ else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
+ bS[0] = bS[1] = bS[2] = bS[3] = 1;
+ mv_done = 1;
+ }
+ else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
+ int b_idx= 8 + 4 + edge * (dir ? 8:1);
+ int bn_idx= b_idx - (dir ? 8:1);
+ int v = 0;
+
+ for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
+ v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
+ FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
+ FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
+ }
+
+ if(h->slice_type_nos == FF_B_TYPE && v){
+ v=0;
+ for( l = 0; !v && l < 2; l++ ) {
+ int ln= 1-l;
+ v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
+ FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
+ FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
+ }
+ }
+
+ bS[0] = bS[1] = bS[2] = bS[3] = v;
+ mv_done = 1;
+ }
+ else
+ mv_done = 0;
+
+ for( i = 0; i < 4; i++ ) {
+ int x = dir == 0 ? edge : i;
+ int y = dir == 0 ? i : edge;
+ int b_idx= 8 + 4 + x + 8*y;
+ int bn_idx= b_idx - (dir ? 8:1);
+
+ if( h->non_zero_count_cache[b_idx] |
+ h->non_zero_count_cache[bn_idx] ) {
+ bS[i] = 2;
+ }
+ else if(!mv_done)
+ {
+ bS[i] = 0;
+ for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
+ if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
+ FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
+ FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
+ bS[i] = 1;
+ break;
+ }
+ }
+
+ if(h->slice_type_nos == FF_B_TYPE && bS[i]){
+ bS[i] = 0;
+ for( l = 0; l < 2; l++ ) {
+ int ln= 1-l;
+ if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
+ FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
+ FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
+ bS[i] = 1;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
+ continue;
+ }
+
+ /* Filter edge */
+ // Do not use s->qscale as luma quantizer because it has not the same
+ // value in IPCM macroblocks.
+ qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
+ //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
+ tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
+ { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
+ if( dir == 0 ) {
+ filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
+ if( (edge&1) == 0 ) {
+ filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
+ ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
+ filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
+ ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
+ }
+ } else {
+ filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
+ if( (edge&1) == 0 ) {
+ filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
+ ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
+ filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
+ ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
+ }
+ }
+ }
+}
+
static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
MpegEncContext * const s = &h->s;
const int mb_xy= mb_x + mb_y*s->mb_stride;
@@ -6347,16 +6498,16 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
if(IS_8x8DCT(mb_type)){
h->non_zero_count_cache[scan8[0 ]]= h->non_zero_count_cache[scan8[1 ]]=
- h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp_table[mb_xy] & 1;
+ h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp & 1;
h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
- h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
+ h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
- h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
+ h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
- h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
+ h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
}
}
@@ -6420,210 +6571,14 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
}
- /* dir : 0 -> vertical edge, 1 -> horizontal edge */
- for( dir = 0; dir < 2; dir++ )
- {
- int edge;
- const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
- const int mbm_type = s->current_picture.mb_type[mbm_xy];
- int (*ref2frm) [64] = h->ref2frm[ h->slice_num &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
- int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
- int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
-
- const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
- == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
- // how often to recheck mv-based bS when iterating between edges
- const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
- (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
- // how often to recheck mv-based bS when iterating along each edge
- const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
-
- if (first_vertical_edge_done) {
- start = 1;
- first_vertical_edge_done = 0;
- }
-
- if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
- start = 1;
-
- if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
- && !IS_INTERLACED(mb_type)
- && IS_INTERLACED(mbm_type)
- ) {
- // This is a special case in the norm where the filtering must
- // be done twice (one each of the field) even if we are in a
- // frame macroblock.
- //
- static const int nnz_idx[4] = {4,5,6,3};
- unsigned int tmp_linesize = 2 * linesize;
- unsigned int tmp_uvlinesize = 2 * uvlinesize;
- int mbn_xy = mb_xy - 2 * s->mb_stride;
- int qp;
- int i, j;
- int16_t bS[4];
-
- for(j=0; j<2; j++, mbn_xy += s->mb_stride){
- if( IS_INTRA(mb_type) ||
- IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
- bS[0] = bS[1] = bS[2] = bS[3] = 3;
- } else {
- const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
- for( i = 0; i < 4; i++ ) {
- if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
- mbn_nnz[nnz_idx[i]] != 0 )
- bS[i] = 2;
- else
- bS[i] = 1;
- }
- }
- // Do not use s->qscale as luma quantizer because it has not the same
- // value in IPCM macroblocks.
- qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
- tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
- { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
- filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
- filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
- ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
- filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
- ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
- }
-
- start = 1;
- }
-
- /* Calculate bS */
- for( edge = start; edge < edges; edge++ ) {
- /* mbn_xy: neighbor macroblock */
- const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
- const int mbn_type = s->current_picture.mb_type[mbn_xy];
- int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
- int16_t bS[4];
- int qp;
-
- if( (edge&1) && IS_8x8DCT(mb_type) )
- continue;
- if( IS_INTRA(mb_type) ||
- IS_INTRA(mbn_type) ) {
- int value;
- if (edge == 0) {
- if ( (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
- || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
- ) {
- value = 4;
- } else {
- value = 3;
- }
- } else {
- value = 3;
- }
- bS[0] = bS[1] = bS[2] = bS[3] = value;
- } else {
- int i, l;
- int mv_done;
-
- if( edge & mask_edge ) {
- bS[0] = bS[1] = bS[2] = bS[3] = 0;
- mv_done = 1;
- }
- else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
- bS[0] = bS[1] = bS[2] = bS[3] = 1;
- mv_done = 1;
- }
- else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
- int b_idx= 8 + 4 + edge * (dir ? 8:1);
- int bn_idx= b_idx - (dir ? 8:1);
- int v = 0;
-
- for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
- v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
- FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
- FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
- }
-
- if(h->slice_type_nos == FF_B_TYPE && v){
- v=0;
- for( l = 0; !v && l < 2; l++ ) {
- int ln= 1-l;
- v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
- FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
- FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
- }
- }
-
- bS[0] = bS[1] = bS[2] = bS[3] = v;
- mv_done = 1;
- }
- else
- mv_done = 0;
-
- for( i = 0; i < 4; i++ ) {
- int x = dir == 0 ? edge : i;
- int y = dir == 0 ? i : edge;
- int b_idx= 8 + 4 + x + 8*y;
- int bn_idx= b_idx - (dir ? 8:1);
-
- if( h->non_zero_count_cache[b_idx] != 0 ||
- h->non_zero_count_cache[bn_idx] != 0 ) {
- bS[i] = 2;
- }
- else if(!mv_done)
- {
- bS[i] = 0;
- for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
- if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
- FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
- FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
- bS[i] = 1;
- break;
- }
- }
-
- if(h->slice_type_nos == FF_B_TYPE && bS[i]){
- bS[i] = 0;
- for( l = 0; l < 2; l++ ) {
- int ln= 1-l;
- if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
- FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
- FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
- bS[i] = 1;
- break;
- }
- }
- }
- }
- }
-
- if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
- continue;
- }
-
- /* Filter edge */
- // Do not use s->qscale as luma quantizer because it has not the same
- // value in IPCM macroblocks.
- qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
- //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
- tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
- { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
- if( dir == 0 ) {
- filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
- if( (edge&1) == 0 ) {
- filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
- ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
- filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
- ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
- }
- } else {
- filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
- if( (edge&1) == 0 ) {
- filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
- ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
- filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
- ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
- }
- }
- }
- }
+#ifdef CONFIG_SMALL
+ for( dir = 0; dir < 2; dir++ )
+ filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
+#else
+ filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
+ filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
+#endif
}
static int decode_slice(struct AVCodecContext *avctx, void *arg){
@@ -6633,6 +6588,9 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg){
s->mb_skip_run= -1;
+ h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
+ (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding);
+
if( h->pps.cabac ) {
int i;
@@ -6669,7 +6627,7 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg){
if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
s->mb_y++;
- if(ret>=0) ret = decode_mb_cabac(h);
+ ret = decode_mb_cabac(h);
if(ret>=0) hl_decode_mb(h);
s->mb_y--;
@@ -6913,10 +6871,16 @@ static int decode_sei(H264Context *h){
return 0;
}
-static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
+static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
MpegEncContext * const s = &h->s;
int cpb_count, i;
- cpb_count = get_ue_golomb(&s->gb) + 1;
+ cpb_count = get_ue_golomb_31(&s->gb) + 1;
+
+ if(cpb_count > 32U){
+ av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
+ return -1;
+ }
+
get_bits(&s->gb, 4); /* bit_rate_scale */
get_bits(&s->gb, 4); /* cpb_size_scale */
for(i=0; i<cpb_count; i++){
@@ -6928,6 +6892,7 @@ static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
sps->time_offset_length = get_bits(&s->gb, 5);
+ return 0;
}
static inline int decode_vui_parameters(H264Context *h, SPS *sps){
@@ -6982,31 +6947,30 @@ static inline int decode_vui_parameters(H264Context *h, SPS *sps){
sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
if(sps->nal_hrd_parameters_present_flag)
- decode_hrd_parameters(h, sps);
+ if(decode_hrd_parameters(h, sps) < 0)
+ return -1;
sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
if(sps->vcl_hrd_parameters_present_flag)
- decode_hrd_parameters(h, sps);
+ if(decode_hrd_parameters(h, sps) < 0)
+ return -1;
if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
get_bits1(&s->gb); /* low_delay_hrd_flag */
sps->pic_struct_present_flag = get_bits1(&s->gb);
sps->bitstream_restriction_flag = get_bits1(&s->gb);
if(sps->bitstream_restriction_flag){
- unsigned int num_reorder_frames;
get_bits1(&s->gb); /* motion_vectors_over_pic_boundaries_flag */
get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
- num_reorder_frames= get_ue_golomb(&s->gb);
+ sps->num_reorder_frames= get_ue_golomb(&s->gb);
get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
- if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
- av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", num_reorder_frames);
+ if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
+ av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
return -1;
}
-
- sps->num_reorder_frames= num_reorder_frames;
}
return 0;
@@ -7056,30 +7020,10 @@ static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_s
}
}
-/**
- * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
- */
-static void *
-alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
- const size_t size, const char *name)
-{
- if(id>=max) {
- av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
- return NULL;
- }
-
- if(!vec[id]) {
- vec[id] = av_mallocz(size);
- if(vec[id] == NULL)
- av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
- }
- return vec[id];
-}
-
static inline int decode_seq_parameter_set(H264Context *h){
MpegEncContext * const s = &h->s;
int profile_idc, level_idc;
- unsigned int sps_id, tmp, mb_width, mb_height;
+ unsigned int sps_id;
int i;
SPS *sps;
@@ -7090,9 +7034,13 @@ static inline int decode_seq_parameter_set(H264Context *h){
get_bits1(&s->gb); //constraint_set3_flag
get_bits(&s->gb, 4); // reserved
level_idc= get_bits(&s->gb, 8);
- sps_id= get_ue_golomb(&s->gb);
+ sps_id= get_ue_golomb_31(&s->gb);
- sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
+ if(sps_id >= MAX_SPS_COUNT) {
+ av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
+ return -1;
+ }
+ sps= av_mallocz(sizeof(SPS));
if(sps == NULL)
return -1;
@@ -7104,7 +7052,7 @@ static inline int decode_seq_parameter_set(H264Context *h){
sps->scaling_matrix_present = 0;
if(sps->profile_idc >= 100){ //high profile
- sps->chroma_format_idc= get_ue_golomb(&s->gb);
+ sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
if(sps->chroma_format_idc == 3)
get_bits1(&s->gb); //residual_color_transform_flag
get_ue_golomb(&s->gb); //bit_depth_luma_minus8
@@ -7116,7 +7064,7 @@ static inline int decode_seq_parameter_set(H264Context *h){
}
sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
- sps->poc_type= get_ue_golomb(&s->gb);
+ sps->poc_type= get_ue_golomb_31(&s->gb);
if(sps->poc_type == 0){ //FIXME #define
sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
@@ -7124,37 +7072,33 @@ static inline int decode_seq_parameter_set(H264Context *h){
sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
- tmp= get_ue_golomb(&s->gb);
+ sps->poc_cycle_length = get_ue_golomb(&s->gb);
- if(tmp >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
- av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
- return -1;
+ if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
+ av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
+ goto fail;
}
- sps->poc_cycle_length= tmp;
for(i=0; i<sps->poc_cycle_length; i++)
sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
}else if(sps->poc_type != 2){
av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
- return -1;
+ goto fail;
}
- tmp= get_ue_golomb(&s->gb);
- if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
+ sps->ref_frame_count= get_ue_golomb_31(&s->gb);
+ if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
- return -1;
+ goto fail;
}
- sps->ref_frame_count= tmp;
sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
- mb_width= get_ue_golomb(&s->gb) + 1;
- mb_height= get_ue_golomb(&s->gb) + 1;
- if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
- avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
+ sps->mb_width = get_ue_golomb(&s->gb) + 1;
+ sps->mb_height= get_ue_golomb(&s->gb) + 1;
+ if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
+ avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
- return -1;
+ goto fail;
}
- sps->mb_width = mb_width;
- sps->mb_height= mb_height;
sps->frame_mbs_only_flag= get_bits1(&s->gb);
if(!sps->frame_mbs_only_flag)
@@ -7205,7 +7149,12 @@ static inline int decode_seq_parameter_set(H264Context *h){
((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
);
}
+ av_free(h->sps_buffers[sps_id]);
+ h->sps_buffers[sps_id]= sps;
return 0;
+fail:
+ av_free(sps);
+ return -1;
}
static void
@@ -7218,19 +7167,22 @@ build_qp_table(PPS *pps, int t, int index)
static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
MpegEncContext * const s = &h->s;
- unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
+ unsigned int pps_id= get_ue_golomb(&s->gb);
PPS *pps;
- pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
- if(pps == NULL)
+ if(pps_id >= MAX_PPS_COUNT) {
+ av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
return -1;
+ }
- tmp= get_ue_golomb(&s->gb);
- if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
- av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
+ pps= av_mallocz(sizeof(PPS));
+ if(pps == NULL)
return -1;
+ pps->sps_id= get_ue_golomb_31(&s->gb);
+ if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
+ av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
+ goto fail;
}
- pps->sps_id= tmp;
pps->cabac= get_bits1(&s->gb);
pps->pic_order_present= get_bits1(&s->gb);
@@ -7276,8 +7228,7 @@ static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
- pps->ref_count[0]= pps->ref_count[1]= 1;
- return -1;
+ goto fail;
}
pps->weighted_pred= get_bits1(&s->gb);
@@ -7322,7 +7273,12 @@ static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
);
}
+ av_free(h->pps_buffers[pps_id]);
+ h->pps_buffers[pps_id]= pps;
return 0;
+fail:
+ av_free(pps);
+ return -1;
}
/**
@@ -7337,6 +7293,8 @@ static void execute_decode_slices(H264Context *h, int context_count){
H264Context *hx;
int i;
+ if(avctx->codec_id == CODEC_ID_H264_VDPAU)
+ return;
if(context_count == 1) {
decode_slice(avctx, &h);
} else {
@@ -7464,8 +7422,14 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
&& (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
&& (avctx->skip_frame < AVDISCARD_BIDIR || hx->slice_type_nos!=FF_B_TYPE)
&& (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
- && avctx->skip_frame < AVDISCARD_ALL)
- context_count++;
+ && avctx->skip_frame < AVDISCARD_ALL){
+ if(ENABLE_H264_VDPAU_DECODER && avctx->codec_id == CODEC_ID_H264_VDPAU){
+ static const uint8_t start_code[] = {0x00, 0x00, 0x01};
+ ff_vdpau_h264_add_data_chunk(h, start_code, sizeof(start_code));
+ ff_vdpau_h264_add_data_chunk(h, &buf[buf_index - consumed], consumed );
+ }else
+ context_count++;
+ }
break;
case NAL_DPA:
init_get_bits(&hx->s.gb, ptr, bit_length);
@@ -7668,6 +7632,9 @@ static int decode_frame(AVCodecContext *avctx,
h->prev_frame_num_offset= h->frame_num_offset;
h->prev_frame_num= h->frame_num;
+ if (ENABLE_H264_VDPAU_DECODER && avctx->codec_id == CODEC_ID_H264_VDPAU)
+ ff_vdpau_h264_picture_complete(h);
+
/*
* FIXME: Error handling code does not seem to support interlaced
* when slices span multiple rows
@@ -8053,4 +8020,22 @@ AVCodec h264_decoder = {
.long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
};
+#ifdef CONFIG_H264_VDPAU_DECODER
+AVCodec h264_vdpau_decoder = {
+ "h264_vdpau",
+ CODEC_TYPE_VIDEO,
+ CODEC_ID_H264_VDPAU,
+ sizeof(H264Context),
+ decode_init,
+ NULL,
+ decode_end,
+ decode_frame,
+ CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
+ .flush= flush_dpb,
+ .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
+};
+#endif
+
+#ifdef CONFIG_SVQ3_DECODER
#include "svq3.c"
+#endif
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index f7ea19e..40a94a0 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -92,6 +92,11 @@
#define EXTENDED_SAR 255
+#define MB_TYPE_REF0 MB_TYPE_ACPRED //dirty but it fits in 16 bit
+#define MB_TYPE_8x8DCT 0x01000000
+#define IS_REF0(a) ((a) & MB_TYPE_REF0)
+#define IS_8x8DCT(a) ((a) & MB_TYPE_8x8DCT)
+
/* NAL unit types */
enum {
NAL_SLICE=1,
@@ -334,7 +339,7 @@ typedef struct H264Context{
int mb_field_decoding_flag;
int mb_mbaff; ///< mb_aff_frame && mb_field_decoding_flag
- unsigned int sub_mb_type[4];
+ uint16_t sub_mb_type[4];
//POC stuff
int poc_lsb;
@@ -486,6 +491,8 @@ typedef struct H264Context{
* pic_struct in picture timing SEI message
*/
SEI_PicStructType sei_pic_struct;
+
+ int is_complex;
}H264Context;
#endif /* AVCODEC_H264_H */
diff --git a/libavcodec/h264_mp4toannexb_bsf.c b/libavcodec/h264_mp4toannexb_bsf.c
index 03eb956..e94f8c6 100644
--- a/libavcodec/h264_mp4toannexb_bsf.c
+++ b/libavcodec/h264_mp4toannexb_bsf.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007 Benoit Fouet <benoit.fouet at purplelabs.com>
+ * Copyright (c) 2007 Benoit Fouet <benoit.fouet at free.fr>
*
* This file is part of FFmpeg.
*
diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index e0973bf..cfa04c4 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -44,6 +44,18 @@ int ff_h264_find_frame_end(H264Context *h, const uint8_t *buf, int buf_size)
for(i=0; i<buf_size; i++){
if(state==7){
+#ifdef HAVE_FAST_UNALIGNED
+ /* we check i<buf_size instead of i+3/7 because its simpler
+ * and there should be FF_INPUT_BUFFER_PADDING_SIZE bytes at the end
+ */
+# ifdef HAVE_FAST_64BIT
+ while(i<buf_size && !((~*(uint64_t*)(buf+i) & (*(uint64_t*)(buf+i) - 0x0101010101010101ULL)) & 0x8080808080808080ULL))
+ i+=8;
+# else
+ while(i<buf_size && !((~*(uint32_t*)(buf+i) & (*(uint32_t*)(buf+i) - 0x01010101U)) & 0x80808080U))
+ i+=4;
+# endif
+#endif
for(; i<buf_size; i++){
if(!buf[i]){
state=2;
@@ -149,7 +161,8 @@ static void close(AVCodecParserContext *s)
AVCodecParser h264_parser = {
- { CODEC_ID_H264 },
+ { CODEC_ID_H264,
+ CODEC_ID_H264_VDPAU },
sizeof(H264Context),
NULL,
h264_parse,
diff --git a/libavcodec/h264data.h b/libavcodec/h264data.h
index fa4bff8..20ea3bb 100644
--- a/libavcodec/h264data.h
+++ b/libavcodec/h264data.h
@@ -32,6 +32,7 @@
#include <stdint.h>
#include "libavutil/rational.h"
#include "mpegvideo.h"
+#include "h264.h"
static const AVRational pixel_aspect[17]={
@@ -57,9 +58,6 @@ static const AVRational pixel_aspect[17]={
static const uint8_t golomb_to_pict_type[5]=
{FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
-static const uint8_t pict_type_to_golomb[7]=
-{-1, 2, 0, 1, -1, 4, 3};
-
static const uint8_t chroma_qp[52]={
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,
12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,
@@ -80,18 +78,6 @@ static const uint8_t golomb_to_inter_cbp[48]={
17, 18, 20, 24, 19, 21, 26, 28, 23, 27, 29, 30, 22, 25, 38, 41
};
-static const uint8_t intra4x4_cbp_to_golomb[48]={
- 3, 29, 30, 17, 31, 18, 37, 8, 32, 38, 19, 9, 20, 10, 11, 2,
- 16, 33, 34, 21, 35, 22, 39, 4, 36, 40, 23, 5, 24, 6, 7, 1,
- 41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15, 0
-};
-
-static const uint8_t inter_cbp_to_golomb[48]={
- 0, 2, 3, 7, 4, 8, 17, 13, 5, 18, 9, 14, 10, 15, 16, 11,
- 1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
- 6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12
-};
-
static const uint8_t golomb_to_inter_cbp_gray[16]={
0, 1, 2, 4, 8, 3, 5,10,12,15, 7,11,13,14, 6, 9,
};
@@ -377,12 +363,6 @@ static const uint8_t field_scan8x8_cavlc[64]={
6+3*8, 6+5*8, 7+3*8, 7+7*8,
};
-#define MB_TYPE_REF0 MB_TYPE_ACPRED //dirty but it fits in 16bit
-#define MB_TYPE_8x8DCT 0x01000000
-#define IS_REF0(a) ((a)&MB_TYPE_REF0)
-#define IS_8x8DCT(a) ((a)&MB_TYPE_8x8DCT)
-
-
typedef struct IMbInfo{
uint16_t type;
uint8_t pred_mode;
@@ -533,63 +513,6 @@ static const uint8_t dequant8_coeff_init[6][6]={
{36,32,58,34,46,43},
};
-#define QUANT_SHIFT 22
-
-static const int quant_coeff[52][16]={
- { 419430,258111,419430,258111,258111,167772,258111,167772,419430,258111,419430,258111,258111,167772,258111,167772,},
- { 381300,239675,381300,239675,239675,149131,239675,149131,381300,239675,381300,239675,239675,149131,239675,149131,},
- { 322639,209715,322639,209715,209715,134218,209715,134218,322639,209715,322639,209715,209715,134218,209715,134218,},
- { 299593,186414,299593,186414,186414,116711,186414,116711,299593,186414,299593,186414,186414,116711,186414,116711,},
- { 262144,167772,262144,167772,167772,107374,167772,107374,262144,167772,262144,167772,167772,107374,167772,107374,},
- { 233017,145889,233017,145889,145889, 92564,145889, 92564,233017,145889,233017,145889,145889, 92564,145889, 92564,},
- { 209715,129056,209715,129056,129056, 83886,129056, 83886,209715,129056,209715,129056,129056, 83886,129056, 83886,},
- { 190650,119837,190650,119837,119837, 74565,119837, 74565,190650,119837,190650,119837,119837, 74565,119837, 74565,},
- { 161319,104858,161319,104858,104858, 67109,104858, 67109,161319,104858,161319,104858,104858, 67109,104858, 67109,},
- { 149797, 93207,149797, 93207, 93207, 58356, 93207, 58356,149797, 93207,149797, 93207, 93207, 58356, 93207, 58356,},
- { 131072, 83886,131072, 83886, 83886, 53687, 83886, 53687,131072, 83886,131072, 83886, 83886, 53687, 83886, 53687,},
- { 116508, 72944,116508, 72944, 72944, 46282, 72944, 46282,116508, 72944,116508, 72944, 72944, 46282, 72944, 46282,},
- { 104858, 64528,104858, 64528, 64528, 41943, 64528, 41943,104858, 64528,104858, 64528, 64528, 41943, 64528, 41943,},
- { 95325, 59919, 95325, 59919, 59919, 37283, 59919, 37283, 95325, 59919, 95325, 59919, 59919, 37283, 59919, 37283,},
- { 80660, 52429, 80660, 52429, 52429, 33554, 52429, 33554, 80660, 52429, 80660, 52429, 52429, 33554, 52429, 33554,},
- { 74898, 46603, 74898, 46603, 46603, 29178, 46603, 29178, 74898, 46603, 74898, 46603, 46603, 29178, 46603, 29178,},
- { 65536, 41943, 65536, 41943, 41943, 26844, 41943, 26844, 65536, 41943, 65536, 41943, 41943, 26844, 41943, 26844,},
- { 58254, 36472, 58254, 36472, 36472, 23141, 36472, 23141, 58254, 36472, 58254, 36472, 36472, 23141, 36472, 23141,},
- { 52429, 32264, 52429, 32264, 32264, 20972, 32264, 20972, 52429, 32264, 52429, 32264, 32264, 20972, 32264, 20972,},
- { 47663, 29959, 47663, 29959, 29959, 18641, 29959, 18641, 47663, 29959, 47663, 29959, 29959, 18641, 29959, 18641,},
- { 40330, 26214, 40330, 26214, 26214, 16777, 26214, 16777, 40330, 26214, 40330, 26214, 26214, 16777, 26214, 16777,},
- { 37449, 23302, 37449, 23302, 23302, 14589, 23302, 14589, 37449, 23302, 37449, 23302, 23302, 14589, 23302, 14589,},
- { 32768, 20972, 32768, 20972, 20972, 13422, 20972, 13422, 32768, 20972, 32768, 20972, 20972, 13422, 20972, 13422,},
- { 29127, 18236, 29127, 18236, 18236, 11570, 18236, 11570, 29127, 18236, 29127, 18236, 18236, 11570, 18236, 11570,},
- { 26214, 16132, 26214, 16132, 16132, 10486, 16132, 10486, 26214, 16132, 26214, 16132, 16132, 10486, 16132, 10486,},
- { 23831, 14980, 23831, 14980, 14980, 9321, 14980, 9321, 23831, 14980, 23831, 14980, 14980, 9321, 14980, 9321,},
- { 20165, 13107, 20165, 13107, 13107, 8389, 13107, 8389, 20165, 13107, 20165, 13107, 13107, 8389, 13107, 8389,},
- { 18725, 11651, 18725, 11651, 11651, 7294, 11651, 7294, 18725, 11651, 18725, 11651, 11651, 7294, 11651, 7294,},
- { 16384, 10486, 16384, 10486, 10486, 6711, 10486, 6711, 16384, 10486, 16384, 10486, 10486, 6711, 10486, 6711,},
- { 14564, 9118, 14564, 9118, 9118, 5785, 9118, 5785, 14564, 9118, 14564, 9118, 9118, 5785, 9118, 5785,},
- { 13107, 8066, 13107, 8066, 8066, 5243, 8066, 5243, 13107, 8066, 13107, 8066, 8066, 5243, 8066, 5243,},
- { 11916, 7490, 11916, 7490, 7490, 4660, 7490, 4660, 11916, 7490, 11916, 7490, 7490, 4660, 7490, 4660,},
- { 10082, 6554, 10082, 6554, 6554, 4194, 6554, 4194, 10082, 6554, 10082, 6554, 6554, 4194, 6554, 4194,},
- { 9362, 5825, 9362, 5825, 5825, 3647, 5825, 3647, 9362, 5825, 9362, 5825, 5825, 3647, 5825, 3647,},
- { 8192, 5243, 8192, 5243, 5243, 3355, 5243, 3355, 8192, 5243, 8192, 5243, 5243, 3355, 5243, 3355,},
- { 7282, 4559, 7282, 4559, 4559, 2893, 4559, 2893, 7282, 4559, 7282, 4559, 4559, 2893, 4559, 2893,},
- { 6554, 4033, 6554, 4033, 4033, 2621, 4033, 2621, 6554, 4033, 6554, 4033, 4033, 2621, 4033, 2621,},
- { 5958, 3745, 5958, 3745, 3745, 2330, 3745, 2330, 5958, 3745, 5958, 3745, 3745, 2330, 3745, 2330,},
- { 5041, 3277, 5041, 3277, 3277, 2097, 3277, 2097, 5041, 3277, 5041, 3277, 3277, 2097, 3277, 2097,},
- { 4681, 2913, 4681, 2913, 2913, 1824, 2913, 1824, 4681, 2913, 4681, 2913, 2913, 1824, 2913, 1824,},
- { 4096, 2621, 4096, 2621, 2621, 1678, 2621, 1678, 4096, 2621, 4096, 2621, 2621, 1678, 2621, 1678,},
- { 3641, 2280, 3641, 2280, 2280, 1446, 2280, 1446, 3641, 2280, 3641, 2280, 2280, 1446, 2280, 1446,},
- { 3277, 2016, 3277, 2016, 2016, 1311, 2016, 1311, 3277, 2016, 3277, 2016, 2016, 1311, 2016, 1311,},
- { 2979, 1872, 2979, 1872, 1872, 1165, 1872, 1165, 2979, 1872, 2979, 1872, 1872, 1165, 1872, 1165,},
- { 2521, 1638, 2521, 1638, 1638, 1049, 1638, 1049, 2521, 1638, 2521, 1638, 1638, 1049, 1638, 1049,},
- { 2341, 1456, 2341, 1456, 1456, 912, 1456, 912, 2341, 1456, 2341, 1456, 1456, 912, 1456, 912,},
- { 2048, 1311, 2048, 1311, 1311, 839, 1311, 839, 2048, 1311, 2048, 1311, 1311, 839, 1311, 839,},
- { 1820, 1140, 1820, 1140, 1140, 723, 1140, 723, 1820, 1140, 1820, 1140, 1140, 723, 1140, 723,},
- { 1638, 1008, 1638, 1008, 1008, 655, 1008, 655, 1638, 1008, 1638, 1008, 1008, 655, 1008, 655,},
- { 1489, 936, 1489, 936, 936, 583, 936, 583, 1489, 936, 1489, 936, 936, 583, 936, 583,},
- { 1260, 819, 1260, 819, 819, 524, 819, 524, 1260, 819, 1260, 819, 819, 524, 819, 524,},
- { 1170, 728, 1170, 728, 728, 456, 728, 456, 1170, 728, 1170, 728, 728, 456, 728, 456,},
-};
-
/* Deblocking filter (p153) */
static const uint8_t alpha_table[52*3] = {
@@ -624,34 +547,34 @@ static const uint8_t beta_table[52*3] = {
18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
};
-static const uint8_t tc0_table[52*3][3] = {
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 },
- { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 1 },
- { 0, 0, 1 }, { 0, 0, 1 }, { 0, 0, 1 }, { 0, 1, 1 }, { 0, 1, 1 }, { 1, 1, 1 },
- { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 2 }, { 1, 1, 2 }, { 1, 1, 2 },
- { 1, 1, 2 }, { 1, 2, 3 }, { 1, 2, 3 }, { 2, 2, 3 }, { 2, 2, 4 }, { 2, 3, 4 },
- { 2, 3, 4 }, { 3, 3, 5 }, { 3, 4, 6 }, { 3, 4, 6 }, { 4, 5, 7 }, { 4, 5, 8 },
- { 4, 6, 9 }, { 5, 7,10 }, { 6, 8,11 }, { 6, 8,13 }, { 7,10,14 }, { 8,11,16 },
- { 9,12,18 }, {10,13,20 }, {11,15,23 }, {13,17,25 },
- {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 },
- {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 },
- {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 },
- {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 },
- {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 },
- {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 },
- {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 },
- {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 },
- {13,17,25 }, {13,17,25 }, {13,17,25 }, {13,17,25 },
+static const uint8_t tc0_table[52*3][4] = {
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+ {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
+ {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
+ {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
+ {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
+ {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
+ {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
+ {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
+ {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+ {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+ {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+ {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+ {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+ {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+ {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+ {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
+ {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
};
/* Cabac pre state table */
diff --git a/libavcodec/h264enc.c b/libavcodec/h264enc.c
index 2cff901..788e0ae 100644
--- a/libavcodec/h264enc.c
+++ b/libavcodec/h264enc.c
@@ -107,3 +107,154 @@ static uint8_t *h264_write_nal_unit(int nal_ref_idc, int nal_unit_type, uint8_t
return dest+destpos;
}
+static const uint8_t pict_type_to_golomb[7] = {-1, 2, 0, 1, -1, 4, 3};
+
+static const uint8_t intra4x4_cbp_to_golomb[48] = {
+ 3, 29, 30, 17, 31, 18, 37, 8, 32, 38, 19, 9, 20, 10, 11, 2,
+ 16, 33, 34, 21, 35, 22, 39, 4, 36, 40, 23, 5, 24, 6, 7, 1,
+ 41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15, 0
+};
+
+static const uint8_t inter_cbp_to_golomb[48] = {
+ 0, 2, 3, 7, 4, 8, 17, 13, 5, 18, 9, 14, 10, 15, 16, 11,
+ 1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
+ 6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12
+};
+
+#define QUANT_SHIFT 22
+
+static const int quant_coeff[52][16] = {
+ { 419430, 258111, 419430, 258111, 258111, 167772, 258111, 167772, 419430, 258111, 419430, 258111, 258111, 167772, 258111, 167772,},
+ { 381300, 239675, 381300, 239675, 239675, 149131, 239675, 149131, 381300, 239675, 381300, 239675, 239675, 149131, 239675, 149131,},
+ { 322639, 209715, 322639, 209715, 209715, 134218, 209715, 134218, 322639, 209715, 322639, 209715, 209715, 134218, 209715, 134218,},
+ { 299593, 186414, 299593, 186414, 186414, 116711, 186414, 116711, 299593, 186414, 299593, 186414, 186414, 116711, 186414, 116711,},
+ { 262144, 167772, 262144, 167772, 167772, 107374, 167772, 107374, 262144, 167772, 262144, 167772, 167772, 107374, 167772, 107374,},
+ { 233017, 145889, 233017, 145889, 145889, 92564, 145889, 92564, 233017, 145889, 233017, 145889, 145889, 92564, 145889, 92564,},
+ { 209715, 129056, 209715, 129056, 129056, 83886, 129056, 83886, 209715, 129056, 209715, 129056, 129056, 83886, 129056, 83886,},
+ { 190650, 119837, 190650, 119837, 119837, 74565, 119837, 74565, 190650, 119837, 190650, 119837, 119837, 74565, 119837, 74565,},
+ { 161319, 104858, 161319, 104858, 104858, 67109, 104858, 67109, 161319, 104858, 161319, 104858, 104858, 67109, 104858, 67109,},
+ { 149797, 93207, 149797, 93207, 93207, 58356, 93207, 58356, 149797, 93207, 149797, 93207, 93207, 58356, 93207, 58356,},
+ { 131072, 83886, 131072, 83886, 83886, 53687, 83886, 53687, 131072, 83886, 131072, 83886, 83886, 53687, 83886, 53687,},
+ { 116508, 72944, 116508, 72944, 72944, 46282, 72944, 46282, 116508, 72944, 116508, 72944, 72944, 46282, 72944, 46282,},
+ { 104858, 64528, 104858, 64528, 64528, 41943, 64528, 41943, 104858, 64528, 104858, 64528, 64528, 41943, 64528, 41943,},
+ { 95325, 59919, 95325, 59919, 59919, 37283, 59919, 37283, 95325, 59919, 95325, 59919, 59919, 37283, 59919, 37283,},
+ { 80660, 52429, 80660, 52429, 52429, 33554, 52429, 33554, 80660, 52429, 80660, 52429, 52429, 33554, 52429, 33554,},
+ { 74898, 46603, 74898, 46603, 46603, 29178, 46603, 29178, 74898, 46603, 74898, 46603, 46603, 29178, 46603, 29178,},
+ { 65536, 41943, 65536, 41943, 41943, 26844, 41943, 26844, 65536, 41943, 65536, 41943, 41943, 26844, 41943, 26844,},
+ { 58254, 36472, 58254, 36472, 36472, 23141, 36472, 23141, 58254, 36472, 58254, 36472, 36472, 23141, 36472, 23141,},
+ { 52429, 32264, 52429, 32264, 32264, 20972, 32264, 20972, 52429, 32264, 52429, 32264, 32264, 20972, 32264, 20972,},
+ { 47663, 29959, 47663, 29959, 29959, 18641, 29959, 18641, 47663, 29959, 47663, 29959, 29959, 18641, 29959, 18641,},
+ { 40330, 26214, 40330, 26214, 26214, 16777, 26214, 16777, 40330, 26214, 40330, 26214, 26214, 16777, 26214, 16777,},
+ { 37449, 23302, 37449, 23302, 23302, 14589, 23302, 14589, 37449, 23302, 37449, 23302, 23302, 14589, 23302, 14589,},
+ { 32768, 20972, 32768, 20972, 20972, 13422, 20972, 13422, 32768, 20972, 32768, 20972, 20972, 13422, 20972, 13422,},
+ { 29127, 18236, 29127, 18236, 18236, 11570, 18236, 11570, 29127, 18236, 29127, 18236, 18236, 11570, 18236, 11570,},
+ { 26214, 16132, 26214, 16132, 16132, 10486, 16132, 10486, 26214, 16132, 26214, 16132, 16132, 10486, 16132, 10486,},
+ { 23831, 14980, 23831, 14980, 14980, 9321, 14980, 9321, 23831, 14980, 23831, 14980, 14980, 9321, 14980, 9321,},
+ { 20165, 13107, 20165, 13107, 13107, 8389, 13107, 8389, 20165, 13107, 20165, 13107, 13107, 8389, 13107, 8389,},
+ { 18725, 11651, 18725, 11651, 11651, 7294, 11651, 7294, 18725, 11651, 18725, 11651, 11651, 7294, 11651, 7294,},
+ { 16384, 10486, 16384, 10486, 10486, 6711, 10486, 6711, 16384, 10486, 16384, 10486, 10486, 6711, 10486, 6711,},
+ { 14564, 9118, 14564, 9118, 9118, 5785, 9118, 5785, 14564, 9118, 14564, 9118, 9118, 5785, 9118, 5785,},
+ { 13107, 8066, 13107, 8066, 8066, 5243, 8066, 5243, 13107, 8066, 13107, 8066, 8066, 5243, 8066, 5243,},
+ { 11916, 7490, 11916, 7490, 7490, 4660, 7490, 4660, 11916, 7490, 11916, 7490, 7490, 4660, 7490, 4660,},
+ { 10082, 6554, 10082, 6554, 6554, 4194, 6554, 4194, 10082, 6554, 10082, 6554, 6554, 4194, 6554, 4194,},
+ { 9362, 5825, 9362, 5825, 5825, 3647, 5825, 3647, 9362, 5825, 9362, 5825, 5825, 3647, 5825, 3647,},
+ { 8192, 5243, 8192, 5243, 5243, 3355, 5243, 3355, 8192, 5243, 8192, 5243, 5243, 3355, 5243, 3355,},
+ { 7282, 4559, 7282, 4559, 4559, 2893, 4559, 2893, 7282, 4559, 7282, 4559, 4559, 2893, 4559, 2893,},
+ { 6554, 4033, 6554, 4033, 4033, 2621, 4033, 2621, 6554, 4033, 6554, 4033, 4033, 2621, 4033, 2621,},
+ { 5958, 3745, 5958, 3745, 3745, 2330, 3745, 2330, 5958, 3745, 5958, 3745, 3745, 2330, 3745, 2330,},
+ { 5041, 3277, 5041, 3277, 3277, 2097, 3277, 2097, 5041, 3277, 5041, 3277, 3277, 2097, 3277, 2097,},
+ { 4681, 2913, 4681, 2913, 2913, 1824, 2913, 1824, 4681, 2913, 4681, 2913, 2913, 1824, 2913, 1824,},
+ { 4096, 2621, 4096, 2621, 2621, 1678, 2621, 1678, 4096, 2621, 4096, 2621, 2621, 1678, 2621, 1678,},
+ { 3641, 2280, 3641, 2280, 2280, 1446, 2280, 1446, 3641, 2280, 3641, 2280, 2280, 1446, 2280, 1446,},
+ { 3277, 2016, 3277, 2016, 2016, 1311, 2016, 1311, 3277, 2016, 3277, 2016, 2016, 1311, 2016, 1311,},
+ { 2979, 1872, 2979, 1872, 1872, 1165, 1872, 1165, 2979, 1872, 2979, 1872, 1872, 1165, 1872, 1165,},
+ { 2521, 1638, 2521, 1638, 1638, 1049, 1638, 1049, 2521, 1638, 2521, 1638, 1638, 1049, 1638, 1049,},
+ { 2341, 1456, 2341, 1456, 1456, 912, 1456, 912, 2341, 1456, 2341, 1456, 1456, 912, 1456, 912,},
+ { 2048, 1311, 2048, 1311, 1311, 839, 1311, 839, 2048, 1311, 2048, 1311, 1311, 839, 1311, 839,},
+ { 1820, 1140, 1820, 1140, 1140, 723, 1140, 723, 1820, 1140, 1820, 1140, 1140, 723, 1140, 723,},
+ { 1638, 1008, 1638, 1008, 1008, 655, 1008, 655, 1638, 1008, 1638, 1008, 1008, 655, 1008, 655,},
+ { 1489, 936, 1489, 936, 936, 583, 936, 583, 1489, 936, 1489, 936, 936, 583, 936, 583,},
+ { 1260, 819, 1260, 819, 819, 524, 819, 524, 1260, 819, 1260, 819, 819, 524, 819, 524,},
+ { 1170, 728, 1170, 728, 728, 456, 728, 456, 1170, 728, 1170, 728, 728, 456, 728, 456,},
+};
+
+//FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
+//FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
+static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale,
+ int intra, int separate_dc)
+{
+ int i;
+ const int * const quant_3Btable = quant_coeff[qscale];
+ const int bias = intra ? (1 << QUANT_SHIFT) / 3 : (1 << QUANT_SHIFT) / 6;
+ const unsigned int threshold1 = (1 << QUANT_SHIFT) - bias - 1;
+ const unsigned int threshold2 = (threshold1 << 1);
+ int last_non_zero;
+
+ if (separate_dc) {
+ if (qscale <= 18) {
+ //avoid overflows
+ const int dc_bias = intra ? (1 << (QUANT_SHIFT - 2)) / 3 : (1 << (QUANT_SHIFT - 2)) / 6;
+ const unsigned int dc_threshold1 = (1 << (QUANT_SHIFT - 2)) - dc_bias - 1;
+ const unsigned int dc_threshold2 = (dc_threshold1 << 1);
+
+ int level = block[0]*quant_coeff[qscale+18][0];
+ if (((unsigned)(level + dc_threshold1)) > dc_threshold2) {
+ if (level > 0) {
+ level = (dc_bias + level) >> (QUANT_SHIFT - 2);
+ block[0] = level;
+ } else {
+ level = (dc_bias - level) >> (QUANT_SHIFT - 2);
+ block[0] = -level;
+ }
+// last_non_zero = i;
+ } else {
+ block[0] = 0;
+ }
+ } else {
+ const int dc_bias = intra ? (1 << (QUANT_SHIFT + 1)) / 3 : (1 << (QUANT_SHIFT + 1)) / 6;
+ const unsigned int dc_threshold1 = (1 << (QUANT_SHIFT + 1)) - dc_bias - 1;
+ const unsigned int dc_threshold2 = (dc_threshold1 << 1);
+
+ int level = block[0]*quant_table[0];
+ if (((unsigned)(level + dc_threshold1)) > dc_threshold2) {
+ if (level > 0) {
+ level = (dc_bias + level) >> (QUANT_SHIFT + 1);
+ block[0] = level;
+ } else {
+ level = (dc_bias - level) >> (QUANT_SHIFT + 1);
+ block[0] = -level;
+ }
+// last_non_zero = i;
+ } else {
+ block[0] = 0;
+ }
+ }
+ last_non_zero = 0;
+ i = 1;
+ } else {
+ last_non_zero = -1;
+ i = 0;
+ }
+
+ for (; i < 16; i++) {
+ const int j = scantable[i];
+ int level = block[j]*quant_table[j];
+
+// if ( bias+level >= (1 << (QMAT_SHIFT - 3))
+// || bias-level >= (1 << (QMAT_SHIFT - 3))) {
+ if (((unsigned)(level + threshold1)) > threshold2) {
+ if (level > 0) {
+ level = (bias + level) >> QUANT_SHIFT;
+ block[j] = level;
+ } else {
+ level = (bias - level) >> QUANT_SHIFT;
+ block[j] = -level;
+ }
+ last_non_zero = i;
+ } else {
+ block[j] = 0;
+ }
+ }
+
+ return last_non_zero;
+}
diff --git a/libavcodec/h264idct.c b/libavcodec/h264idct.c
index 571e2e9..57d1b75 100644
--- a/libavcodec/h264idct.c
+++ b/libavcodec/h264idct.c
@@ -72,58 +72,57 @@ void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block){
void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){
int i;
- DCTELEM (*src)[8] = (DCTELEM(*)[8])block;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
block[0] += 32;
for( i = 0; i < 8; i++ )
{
- const int a0 = src[i][0] + src[i][4];
- const int a2 = src[i][0] - src[i][4];
- const int a4 = (src[i][2]>>1) - src[i][6];
- const int a6 = (src[i][6]>>1) + src[i][2];
+ const int a0 = block[0+i*8] + block[4+i*8];
+ const int a2 = block[0+i*8] - block[4+i*8];
+ const int a4 = (block[2+i*8]>>1) - block[6+i*8];
+ const int a6 = (block[6+i*8]>>1) + block[2+i*8];
const int b0 = a0 + a6;
const int b2 = a2 + a4;
const int b4 = a2 - a4;
const int b6 = a0 - a6;
- const int a1 = -src[i][3] + src[i][5] - src[i][7] - (src[i][7]>>1);
- const int a3 = src[i][1] + src[i][7] - src[i][3] - (src[i][3]>>1);
- const int a5 = -src[i][1] + src[i][7] + src[i][5] + (src[i][5]>>1);
- const int a7 = src[i][3] + src[i][5] + src[i][1] + (src[i][1]>>1);
+ const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1);
+ const int a3 = block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1);
+ const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1);
+ const int a7 = block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1);
const int b1 = (a7>>2) + a1;
const int b3 = a3 + (a5>>2);
const int b5 = (a3>>2) - a5;
const int b7 = a7 - (a1>>2);
- src[i][0] = b0 + b7;
- src[i][7] = b0 - b7;
- src[i][1] = b2 + b5;
- src[i][6] = b2 - b5;
- src[i][2] = b4 + b3;
- src[i][5] = b4 - b3;
- src[i][3] = b6 + b1;
- src[i][4] = b6 - b1;
+ block[0+i*8] = b0 + b7;
+ block[7+i*8] = b0 - b7;
+ block[1+i*8] = b2 + b5;
+ block[6+i*8] = b2 - b5;
+ block[2+i*8] = b4 + b3;
+ block[5+i*8] = b4 - b3;
+ block[3+i*8] = b6 + b1;
+ block[4+i*8] = b6 - b1;
}
for( i = 0; i < 8; i++ )
{
- const int a0 = src[0][i] + src[4][i];
- const int a2 = src[0][i] - src[4][i];
- const int a4 = (src[2][i]>>1) - src[6][i];
- const int a6 = (src[6][i]>>1) + src[2][i];
+ const int a0 = block[i+0*8] + block[i+4*8];
+ const int a2 = block[i+0*8] - block[i+4*8];
+ const int a4 = (block[i+2*8]>>1) - block[i+6*8];
+ const int a6 = (block[i+6*8]>>1) + block[i+2*8];
const int b0 = a0 + a6;
const int b2 = a2 + a4;
const int b4 = a2 - a4;
const int b6 = a0 - a6;
- const int a1 = -src[3][i] + src[5][i] - src[7][i] - (src[7][i]>>1);
- const int a3 = src[1][i] + src[7][i] - src[3][i] - (src[3][i]>>1);
- const int a5 = -src[1][i] + src[7][i] + src[5][i] + (src[5][i]>>1);
- const int a7 = src[3][i] + src[5][i] + src[1][i] + (src[1][i]>>1);
+ const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1);
+ const int a3 = block[i+1*8] + block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1);
+ const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1);
+ const int a7 = block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1);
const int b1 = (a7>>2) + a1;
const int b3 = a3 + (a5>>2);
@@ -165,3 +164,55 @@ void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
dst += stride;
}
}
+
+//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
+static const uint8_t scan8[16 + 2*4]={
+ 4+1*8, 5+1*8, 4+2*8, 5+2*8,
+ 6+1*8, 7+1*8, 6+2*8, 7+2*8,
+ 4+3*8, 5+3*8, 4+4*8, 5+4*8,
+ 6+3*8, 7+3*8, 6+4*8, 7+4*8,
+ 1+1*8, 2+1*8,
+ 1+2*8, 2+2*8,
+ 1+4*8, 2+4*8,
+ 1+5*8, 2+5*8,
+};
+
+void ff_h264_idct_add16_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ int nnz = nnzc[ scan8[i] ];
+ if(nnz){
+ if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_c(dst + block_offset[i], block + i*16, stride);
+ else idct_internal (dst + block_offset[i], block + i*16, stride, 4, 6, 1);
+ }
+ }
+}
+
+void ff_h264_idct_add16intra_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ if(nnzc[ scan8[i] ]) idct_internal (dst + block_offset[i], block + i*16, stride, 4, 6, 1);
+ else if(block[i*16]) ff_h264_idct_dc_add_c(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+void ff_h264_idct8_add4_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i+=4){
+ int nnz = nnzc[ scan8[i] ];
+ if(nnz){
+ if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_c(dst + block_offset[i], block + i*16, stride);
+ else ff_h264_idct8_add_c (dst + block_offset[i], block + i*16, stride);
+ }
+ }
+}
+
+void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=16; i<16+8; i++){
+ if(nnzc[ scan8[i] ])
+ ff_h264_idct_add_c (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ else if(block[i*16])
+ ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ }
+}
diff --git a/libavcodec/h264pred.c b/libavcodec/h264pred.c
index 0b7394a..3282ff4 100644
--- a/libavcodec/h264pred.c
+++ b/libavcodec/h264pred.c
@@ -198,28 +198,6 @@ static void pred4x4_down_left_rv40_c(uint8_t *src, uint8_t *topright, int stride
src[3+3*stride]=(t6 + t7 + 1 + l6 + l7 + 1)>>2;
}
-static void pred4x4_down_left_rv40_notop_c(uint8_t *src, uint8_t *topright, int stride){
- LOAD_LEFT_EDGE
- LOAD_DOWN_LEFT_EDGE
-
- src[0+0*stride]=(l0 + l2 + 2*l1 + 2)>>2;
- src[1+0*stride]=
- src[0+1*stride]=(l1 + l3 + 2*l2 + 2)>>2;
- src[2+0*stride]=
- src[1+1*stride]=
- src[0+2*stride]=(l2 + l4 + 2*l3 + 2)>>2;
- src[3+0*stride]=
- src[2+1*stride]=
- src[1+2*stride]=
- src[0+3*stride]=(l3 + l5 + 2*l4 + 2)>>2;
- src[3+1*stride]=
- src[2+2*stride]=
- src[1+3*stride]=(l4 + l6 + 2*l5 + 2)>>2;
- src[3+2*stride]=
- src[2+3*stride]=(l5 + l7 + 2*l6 + 2)>>2;
- src[3+3*stride]=(l6 + l7 + 1)>>1;
-}
-
static void pred4x4_down_left_rv40_nodown_c(uint8_t *src, uint8_t *topright, int stride){
LOAD_TOP_EDGE
LOAD_TOP_RIGHT_EDGE
@@ -1009,6 +987,93 @@ static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topr
#undef PL
#undef SRC
+static void pred4x4_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){
+ int i;
+ pix -= stride;
+ for(i=0; i<4; i++){
+ uint8_t v = pix[0];
+ pix[1*stride]= v += block[0];
+ pix[2*stride]= v += block[4];
+ pix[3*stride]= v += block[8];
+ pix[4*stride]= v += block[12];
+ pix++;
+ block++;
+ }
+}
+
+static void pred4x4_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){
+ int i;
+ for(i=0; i<4; i++){
+ uint8_t v = pix[-1];
+ pix[0]= v += block[0];
+ pix[1]= v += block[1];
+ pix[2]= v += block[2];
+ pix[3]= v += block[3];
+ pix+= stride;
+ block+= 4;
+ }
+}
+
+static void pred8x8l_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){
+ int i;
+ pix -= stride;
+ for(i=0; i<8; i++){
+ uint8_t v = pix[0];
+ pix[1*stride]= v += block[0];
+ pix[2*stride]= v += block[8];
+ pix[3*stride]= v += block[16];
+ pix[4*stride]= v += block[24];
+ pix[5*stride]= v += block[32];
+ pix[6*stride]= v += block[40];
+ pix[7*stride]= v += block[48];
+ pix[8*stride]= v += block[56];
+ pix++;
+ block++;
+ }
+}
+
+static void pred8x8l_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){
+ int i;
+ for(i=0; i<8; i++){
+ uint8_t v = pix[-1];
+ pix[0]= v += block[0];
+ pix[1]= v += block[1];
+ pix[2]= v += block[2];
+ pix[3]= v += block[3];
+ pix[4]= v += block[4];
+ pix[5]= v += block[5];
+ pix[6]= v += block[6];
+ pix[7]= v += block[7];
+ pix+= stride;
+ block+= 8;
+ }
+}
+
+static void pred16x16_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+ int i;
+ for(i=0; i<16; i++)
+ pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride);
+}
+
+static void pred16x16_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+ int i;
+ for(i=0; i<16; i++)
+ pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride);
+}
+
+static void pred8x8_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+ int i;
+ for(i=0; i<4; i++)
+ pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride);
+}
+
+static void pred8x8_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+ int i;
+ for(i=0; i<4; i++)
+ pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride);
+}
+
+
/**
* Sets the intra prediction function pointers.
*/
@@ -1097,4 +1162,14 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id){
h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
+
+ //special lossless h/v prediction for h264
+ h->pred4x4_add [VERT_PRED ]= pred4x4_vertical_add_c;
+ h->pred4x4_add [ HOR_PRED ]= pred4x4_horizontal_add_c;
+ h->pred8x8l_add [VERT_PRED ]= pred8x8l_vertical_add_c;
+ h->pred8x8l_add [ HOR_PRED ]= pred8x8l_horizontal_add_c;
+ h->pred8x8_add [VERT_PRED8x8]= pred8x8_vertical_add_c;
+ h->pred8x8_add [ HOR_PRED8x8]= pred8x8_horizontal_add_c;
+ h->pred16x16_add[VERT_PRED8x8]= pred16x16_vertical_add_c;
+ h->pred16x16_add[ HOR_PRED8x8]= pred16x16_horizontal_add_c;
}
diff --git a/libavcodec/h264pred.h b/libavcodec/h264pred.h
index 150567d..d87df2a 100644
--- a/libavcodec/h264pred.h
+++ b/libavcodec/h264pred.h
@@ -29,6 +29,7 @@
#define AVCODEC_H264PRED_H
#include "libavutil/common.h"
+#include "dsputil.h"
/**
* Prediction types
@@ -75,6 +76,11 @@ typedef struct H264PredContext{
void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
void (*pred8x8 [4+3+4])(uint8_t *src, int stride);
void (*pred16x16[4+3])(uint8_t *src, int stride);
+
+ void (*pred4x4_add [2])(uint8_t *pix/*align 4*/, const DCTELEM *block/*align 16*/, int stride);
+ void (*pred8x8l_add [2])(uint8_t *pix/*align 8*/, const DCTELEM *block/*align 16*/, int stride);
+ void (*pred8x8_add [3])(uint8_t *pix/*align 8*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride);
+ void (*pred16x16_add[3])(uint8_t *pix/*align 16*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride);
}H264PredContext;
void ff_h264_pred_init(H264PredContext *h, int codec_id);
diff --git a/libavcodec/huffman.c b/libavcodec/huffman.c
index d41dabb..f21a4f8 100644
--- a/libavcodec/huffman.c
+++ b/libavcodec/huffman.c
@@ -67,7 +67,7 @@ static int build_huff_tree(VLC *vlc, Node *nodes, int head, int flags)
* first nb_codes nodes.count must be set
*/
int ff_huff_build_tree(AVCodecContext *avctx, VLC *vlc, int nb_codes,
- Node *nodes, huff_cmp_t cmp, int flags)
+ Node *nodes, HuffCmp cmp, int flags)
{
int i, j;
int cur_node;
diff --git a/libavcodec/huffman.h b/libavcodec/huffman.h
index 0b11af8..bfb7723 100644
--- a/libavcodec/huffman.h
+++ b/libavcodec/huffman.h
@@ -35,8 +35,8 @@ typedef struct {
#define FF_HUFFMAN_FLAG_HNODE_FIRST 0x01
#define FF_HUFFMAN_FLAG_ZERO_COUNT 0x02
-typedef int (*huff_cmp_t)(const void *va, const void *vb);
+typedef int (*HuffCmp)(const void *va, const void *vb);
int ff_huff_build_tree(AVCodecContext *avctx, VLC *vlc, int nb_codes,
- Node *nodes, huff_cmp_t cmp, int flags);
+ Node *nodes, HuffCmp cmp, int flags);
#endif /* AVCODEC_HUFFMAN_H */
diff --git a/libavcodec/huffyuv.c b/libavcodec/huffyuv.c
index cf90adc..bf13e44 100644
--- a/libavcodec/huffyuv.c
+++ b/libavcodec/huffyuv.c
@@ -265,16 +265,16 @@ static int generate_bits_table(uint32_t *dst, uint8_t *len_table){
typedef struct {
uint64_t val;
int name;
-} heap_elem_t;
+} HeapElem;
-static void heap_sift(heap_elem_t *h, int root, int size)
+static void heap_sift(HeapElem *h, int root, int size)
{
while(root*2+1 < size) {
int child = root*2+1;
if(child < size-1 && h[child].val > h[child+1].val)
child++;
if(h[root].val > h[child].val) {
- FFSWAP(heap_elem_t, h[root], h[child]);
+ FFSWAP(HeapElem, h[root], h[child]);
root = child;
} else
break;
@@ -282,7 +282,7 @@ static void heap_sift(heap_elem_t *h, int root, int size)
}
static void generate_len_table(uint8_t *dst, uint64_t *stats, int size){
- heap_elem_t h[size];
+ HeapElem h[size];
int up[2*size];
int len[2*size];
int offset, i, next;
diff --git a/libavcodec/i386/cavsdsp_mmx.c b/libavcodec/i386/cavsdsp_mmx.c
deleted file mode 100644
index 15cafd6..0000000
--- a/libavcodec/i386/cavsdsp_mmx.c
+++ /dev/null
@@ -1,497 +0,0 @@
-/*
- * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
- * Copyright (c) 2006 Stefan Gehrer <stefan.gehrer at gmx.de>
- *
- * MMX-optimized DSP functions, based on H.264 optimizations by
- * Michael Niedermayer and Loren Merritt
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/common.h"
-#include "libavutil/x86_cpu.h"
-#include "libavcodec/dsputil.h"
-#include "dsputil_mmx.h"
-
-/*****************************************************************************
- *
- * inverse transform
- *
- ****************************************************************************/
-
-static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
-{
- __asm__ volatile(
- "movq 112(%0), %%mm4 \n\t" /* mm4 = src7 */
- "movq 16(%0), %%mm5 \n\t" /* mm5 = src1 */
- "movq 80(%0), %%mm2 \n\t" /* mm2 = src5 */
- "movq 48(%0), %%mm7 \n\t" /* mm7 = src3 */
- "movq %%mm4, %%mm0 \n\t"
- "movq %%mm5, %%mm3 \n\t"
- "movq %%mm2, %%mm6 \n\t"
- "movq %%mm7, %%mm1 \n\t"
-
- "paddw %%mm4, %%mm4 \n\t" /* mm4 = 2*src7 */
- "paddw %%mm3, %%mm3 \n\t" /* mm3 = 2*src1 */
- "paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*src5 */
- "paddw %%mm1, %%mm1 \n\t" /* mm1 = 2*src3 */
- "paddw %%mm4, %%mm0 \n\t" /* mm0 = 3*src7 */
- "paddw %%mm3, %%mm5 \n\t" /* mm5 = 3*src1 */
- "paddw %%mm6, %%mm2 \n\t" /* mm2 = 3*src5 */
- "paddw %%mm1, %%mm7 \n\t" /* mm7 = 3*src3 */
- "psubw %%mm4, %%mm5 \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
- "paddw %%mm6, %%mm7 \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
- "psubw %%mm2, %%mm1 \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
- "paddw %%mm0, %%mm3 \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */
-
- "movq %%mm5, %%mm4 \n\t"
- "movq %%mm7, %%mm6 \n\t"
- "movq %%mm3, %%mm0 \n\t"
- "movq %%mm1, %%mm2 \n\t"
- SUMSUB_BA( %%mm7, %%mm5 ) /* mm7 = a0 + a1 mm5 = a0 - a1 */
- "paddw %%mm3, %%mm7 \n\t" /* mm7 = a0 + a1 + a3 */
- "paddw %%mm1, %%mm5 \n\t" /* mm5 = a0 - a1 + a2 */
- "paddw %%mm7, %%mm7 \n\t"
- "paddw %%mm5, %%mm5 \n\t"
- "paddw %%mm6, %%mm7 \n\t" /* mm7 = b4 */
- "paddw %%mm4, %%mm5 \n\t" /* mm5 = b5 */
-
- SUMSUB_BA( %%mm1, %%mm3 ) /* mm1 = a3 + a2 mm3 = a3 - a2 */
- "psubw %%mm1, %%mm4 \n\t" /* mm4 = a0 - a2 - a3 */
- "movq %%mm4, %%mm1 \n\t" /* mm1 = a0 - a2 - a3 */
- "psubw %%mm6, %%mm3 \n\t" /* mm3 = a3 - a2 - a1 */
- "paddw %%mm1, %%mm1 \n\t"
- "paddw %%mm3, %%mm3 \n\t"
- "psubw %%mm2, %%mm1 \n\t" /* mm1 = b7 */
- "paddw %%mm0, %%mm3 \n\t" /* mm3 = b6 */
-
- "movq 32(%0), %%mm2 \n\t" /* mm2 = src2 */
- "movq 96(%0), %%mm6 \n\t" /* mm6 = src6 */
- "movq %%mm2, %%mm4 \n\t"
- "movq %%mm6, %%mm0 \n\t"
- "psllw $2, %%mm4 \n\t" /* mm4 = 4*src2 */
- "psllw $2, %%mm6 \n\t" /* mm6 = 4*src6 */
- "paddw %%mm4, %%mm2 \n\t" /* mm2 = 5*src2 */
- "paddw %%mm6, %%mm0 \n\t" /* mm0 = 5*src6 */
- "paddw %%mm2, %%mm2 \n\t"
- "paddw %%mm0, %%mm0 \n\t"
- "psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */
- "paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */
-
- "movq (%0), %%mm2 \n\t" /* mm2 = src0 */
- "movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */
- SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */
- "psllw $3, %%mm0 \n\t"
- "psllw $3, %%mm2 \n\t"
- "paddw %1, %%mm0 \n\t" /* add rounding bias */
- "paddw %1, %%mm2 \n\t" /* add rounding bias */
-
- SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */
- SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */
- SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */
- SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */
- SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */
- SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */
- :: "r"(block), "m"(bias)
- );
-}
-
-static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
-{
- int i;
- DECLARE_ALIGNED_8(int16_t, b2[64]);
-
- for(i=0; i<2; i++){
- DECLARE_ALIGNED_8(uint64_t, tmp);
-
- cavs_idct8_1d(block+4*i, ff_pw_4);
-
- __asm__ volatile(
- "psraw $3, %%mm7 \n\t"
- "psraw $3, %%mm6 \n\t"
- "psraw $3, %%mm5 \n\t"
- "psraw $3, %%mm4 \n\t"
- "psraw $3, %%mm3 \n\t"
- "psraw $3, %%mm2 \n\t"
- "psraw $3, %%mm1 \n\t"
- "psraw $3, %%mm0 \n\t"
- "movq %%mm7, %0 \n\t"
- TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
- "movq %%mm0, 8(%1) \n\t"
- "movq %%mm6, 24(%1) \n\t"
- "movq %%mm7, 40(%1) \n\t"
- "movq %%mm4, 56(%1) \n\t"
- "movq %0, %%mm7 \n\t"
- TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
- "movq %%mm7, (%1) \n\t"
- "movq %%mm1, 16(%1) \n\t"
- "movq %%mm0, 32(%1) \n\t"
- "movq %%mm3, 48(%1) \n\t"
- : "=m"(tmp)
- : "r"(b2+32*i)
- : "memory"
- );
- }
-
- for(i=0; i<2; i++){
- cavs_idct8_1d(b2+4*i, ff_pw_64);
-
- __asm__ volatile(
- "psraw $7, %%mm7 \n\t"
- "psraw $7, %%mm6 \n\t"
- "psraw $7, %%mm5 \n\t"
- "psraw $7, %%mm4 \n\t"
- "psraw $7, %%mm3 \n\t"
- "psraw $7, %%mm2 \n\t"
- "psraw $7, %%mm1 \n\t"
- "psraw $7, %%mm0 \n\t"
- "movq %%mm7, (%0) \n\t"
- "movq %%mm5, 16(%0) \n\t"
- "movq %%mm3, 32(%0) \n\t"
- "movq %%mm1, 48(%0) \n\t"
- "movq %%mm0, 64(%0) \n\t"
- "movq %%mm2, 80(%0) \n\t"
- "movq %%mm4, 96(%0) \n\t"
- "movq %%mm6, 112(%0) \n\t"
- :: "r"(b2+4*i)
- : "memory"
- );
- }
-
- add_pixels_clamped_mmx(b2, dst, stride);
-
- /* clear block */
- __asm__ volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "movq %%mm7, (%0) \n\t"
- "movq %%mm7, 8(%0) \n\t"
- "movq %%mm7, 16(%0) \n\t"
- "movq %%mm7, 24(%0) \n\t"
- "movq %%mm7, 32(%0) \n\t"
- "movq %%mm7, 40(%0) \n\t"
- "movq %%mm7, 48(%0) \n\t"
- "movq %%mm7, 56(%0) \n\t"
- "movq %%mm7, 64(%0) \n\t"
- "movq %%mm7, 72(%0) \n\t"
- "movq %%mm7, 80(%0) \n\t"
- "movq %%mm7, 88(%0) \n\t"
- "movq %%mm7, 96(%0) \n\t"
- "movq %%mm7, 104(%0) \n\t"
- "movq %%mm7, 112(%0) \n\t"
- "movq %%mm7, 120(%0) \n\t"
- :: "r" (block)
- );
-}
-
-/*****************************************************************************
- *
- * motion compensation
- *
- ****************************************************************************/
-
-/* vertical filter [-1 -2 96 42 -7 0] */
-#define QPEL_CAVSV1(A,B,C,D,E,F,OP) \
- "movd (%0), "#F" \n\t"\
- "movq "#C", %%mm6 \n\t"\
- "pmullw %5, %%mm6 \n\t"\
- "movq "#D", %%mm7 \n\t"\
- "pmullw %6, %%mm7 \n\t"\
- "psllw $3, "#E" \n\t"\
- "psubw "#E", %%mm6 \n\t"\
- "psraw $3, "#E" \n\t"\
- "paddw %%mm7, %%mm6 \n\t"\
- "paddw "#E", %%mm6 \n\t"\
- "paddw "#B", "#B" \n\t"\
- "pxor %%mm7, %%mm7 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, "#F" \n\t"\
- "psubw "#B", %%mm6 \n\t"\
- "psraw $1, "#B" \n\t"\
- "psubw "#A", %%mm6 \n\t"\
- "paddw %4, %%mm6 \n\t"\
- "psraw $7, %%mm6 \n\t"\
- "packuswb %%mm6, %%mm6 \n\t"\
- OP(%%mm6, (%1), A, d) \
- "add %3, %1 \n\t"
-
-/* vertical filter [ 0 -1 5 5 -1 0] */
-#define QPEL_CAVSV2(A,B,C,D,E,F,OP) \
- "movd (%0), "#F" \n\t"\
- "movq "#C", %%mm6 \n\t"\
- "paddw "#D", %%mm6 \n\t"\
- "pmullw %5, %%mm6 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, "#F" \n\t"\
- "psubw "#B", %%mm6 \n\t"\
- "psubw "#E", %%mm6 \n\t"\
- "paddw %4, %%mm6 \n\t"\
- "psraw $3, %%mm6 \n\t"\
- "packuswb %%mm6, %%mm6 \n\t"\
- OP(%%mm6, (%1), A, d) \
- "add %3, %1 \n\t"
-
-/* vertical filter [ 0 -7 42 96 -2 -1] */
-#define QPEL_CAVSV3(A,B,C,D,E,F,OP) \
- "movd (%0), "#F" \n\t"\
- "movq "#C", %%mm6 \n\t"\
- "pmullw %6, %%mm6 \n\t"\
- "movq "#D", %%mm7 \n\t"\
- "pmullw %5, %%mm7 \n\t"\
- "psllw $3, "#B" \n\t"\
- "psubw "#B", %%mm6 \n\t"\
- "psraw $3, "#B" \n\t"\
- "paddw %%mm7, %%mm6 \n\t"\
- "paddw "#B", %%mm6 \n\t"\
- "paddw "#E", "#E" \n\t"\
- "pxor %%mm7, %%mm7 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, "#F" \n\t"\
- "psubw "#E", %%mm6 \n\t"\
- "psraw $1, "#E" \n\t"\
- "psubw "#F", %%mm6 \n\t"\
- "paddw %4, %%mm6 \n\t"\
- "psraw $7, %%mm6 \n\t"\
- "packuswb %%mm6, %%mm6 \n\t"\
- OP(%%mm6, (%1), A, d) \
- "add %3, %1 \n\t"
-
-
-#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
- int w= 2;\
- src -= 2*srcStride;\
- \
- while(w--){\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movd (%0), %%mm0 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm1 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm2 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm3 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm4 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
- VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
- VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
- VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
- VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
- VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
- VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
- VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
- \
- : "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\
- : "memory"\
- );\
- if(h==16){\
- __asm__ volatile(\
- VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
- VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
- VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
- VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
- VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
- VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
- VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
- VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
- \
- : "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\
- : "memory"\
- );\
- }\
- src += 4-(h+5)*srcStride;\
- dst += 4-h*dstStride;\
- }
-
-#define QPEL_CAVS(OPNAME, OP, MMX)\
-static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- int h=8;\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movq %5, %%mm6 \n\t"\
- "1: \n\t"\
- "movq (%0), %%mm0 \n\t"\
- "movq 1(%0), %%mm2 \n\t"\
- "movq %%mm0, %%mm1 \n\t"\
- "movq %%mm2, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpckhbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm3 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "paddw %%mm3, %%mm1 \n\t"\
- "pmullw %%mm6, %%mm0 \n\t"\
- "pmullw %%mm6, %%mm1 \n\t"\
- "movq -1(%0), %%mm2 \n\t"\
- "movq 2(%0), %%mm4 \n\t"\
- "movq %%mm2, %%mm3 \n\t"\
- "movq %%mm4, %%mm5 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- "punpckhbw %%mm7, %%mm5 \n\t"\
- "paddw %%mm4, %%mm2 \n\t"\
- "paddw %%mm3, %%mm5 \n\t"\
- "psubw %%mm2, %%mm0 \n\t"\
- "psubw %%mm5, %%mm1 \n\t"\
- "movq %6, %%mm5 \n\t"\
- "paddw %%mm5, %%mm0 \n\t"\
- "paddw %%mm5, %%mm1 \n\t"\
- "psraw $3, %%mm0 \n\t"\
- "psraw $3, %%mm1 \n\t"\
- "packuswb %%mm1, %%mm0 \n\t"\
- OP(%%mm0, (%1),%%mm5, q) \
- "add %3, %0 \n\t"\
- "add %4, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(src), "+c"(dst), "+m"(h)\
- : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
- : "memory"\
- );\
-}\
-\
-static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
-}\
-\
-static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \
-}\
-\
-static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
-}\
-\
-static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\
-}\
-static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\
- OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
-}\
-\
-static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\
-}\
-static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\
- OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
-}\
-\
-static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\
-}\
-static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\
- OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
-}\
-\
-static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
- OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
- src += 8*srcStride;\
- dst += 8*dstStride;\
- OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
- OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
-}\
-
-#define CAVS_MC(OPNAME, SIZE, MMX) \
-static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
-}\
-
-#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
-#define AVG_3DNOW_OP(a,b,temp, size) \
-"mov" #size " " #b ", " #temp " \n\t"\
-"pavgusb " #temp ", " #a " \n\t"\
-"mov" #size " " #a ", " #b " \n\t"
-#define AVG_MMX2_OP(a,b,temp, size) \
-"mov" #size " " #b ", " #temp " \n\t"\
-"pavgb " #temp ", " #a " \n\t"\
-"mov" #size " " #a ", " #b " \n\t"
-
-QPEL_CAVS(put_, PUT_OP, 3dnow)
-QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
-QPEL_CAVS(put_, PUT_OP, mmx2)
-QPEL_CAVS(avg_, AVG_MMX2_OP, mmx2)
-
-CAVS_MC(put_, 8, 3dnow)
-CAVS_MC(put_, 16,3dnow)
-CAVS_MC(avg_, 8, 3dnow)
-CAVS_MC(avg_, 16,3dnow)
-CAVS_MC(put_, 8, mmx2)
-CAVS_MC(put_, 16,mmx2)
-CAVS_MC(avg_, 8, mmx2)
-CAVS_MC(avg_, 16,mmx2)
-
-void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
-void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
-void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
-void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
-
-void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx) {
-#define dspfunc(PFX, IDX, NUM) \
- c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
- c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmx2; \
- c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmx2; \
- c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmx2; \
- c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmx2; \
-
- dspfunc(put_cavs_qpel, 0, 16);
- dspfunc(put_cavs_qpel, 1, 8);
- dspfunc(avg_cavs_qpel, 0, 16);
- dspfunc(avg_cavs_qpel, 1, 8);
-#undef dspfunc
- c->cavs_idct8_add = cavs_idct8_add_mmx;
-}
-
-void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx) {
-#define dspfunc(PFX, IDX, NUM) \
- c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
- c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \
- c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \
- c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \
- c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \
-
- dspfunc(put_cavs_qpel, 0, 16);
- dspfunc(put_cavs_qpel, 1, 8);
- dspfunc(avg_cavs_qpel, 0, 16);
- dspfunc(avg_cavs_qpel, 1, 8);
-#undef dspfunc
- c->cavs_idct8_add = cavs_idct8_add_mmx;
-}
diff --git a/libavcodec/i386/cpuid.c b/libavcodec/i386/cpuid.c
deleted file mode 100644
index 2f2a669..0000000
--- a/libavcodec/i386/cpuid.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * CPU detection code, extracted from mmx.h
- * (c)1997-99 by H. Dietz and R. Fisher
- * Converted to C and improved by Fabrice Bellard.
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdlib.h>
-#include "libavutil/x86_cpu.h"
-#include "libavcodec/dsputil.h"
-
-#undef printf
-
-/* ebx saving is necessary for PIC. gcc seems unable to see it alone */
-#define cpuid(index,eax,ebx,ecx,edx)\
- __asm__ volatile\
- ("mov %%"REG_b", %%"REG_S"\n\t"\
- "cpuid\n\t"\
- "xchg %%"REG_b", %%"REG_S\
- : "=a" (eax), "=S" (ebx),\
- "=c" (ecx), "=d" (edx)\
- : "0" (index));
-
-/* Function to test if multimedia instructions are supported... */
-int mm_support(void)
-{
- int rval = 0;
- int eax, ebx, ecx, edx;
- int max_std_level, max_ext_level, std_caps=0, ext_caps=0;
- x86_reg a, c;
-
- __asm__ volatile (
- /* See if CPUID instruction is supported ... */
- /* ... Get copies of EFLAGS into eax and ecx */
- "pushf\n\t"
- "pop %0\n\t"
- "mov %0, %1\n\t"
-
- /* ... Toggle the ID bit in one copy and store */
- /* to the EFLAGS reg */
- "xor $0x200000, %0\n\t"
- "push %0\n\t"
- "popf\n\t"
-
- /* ... Get the (hopefully modified) EFLAGS */
- "pushf\n\t"
- "pop %0\n\t"
- : "=a" (a), "=c" (c)
- :
- : "cc"
- );
-
- if (a == c)
- return 0; /* CPUID not supported */
-
- cpuid(0, max_std_level, ebx, ecx, edx);
-
- if(max_std_level >= 1){
- cpuid(1, eax, ebx, ecx, std_caps);
- if (std_caps & (1<<23))
- rval |= FF_MM_MMX;
- if (std_caps & (1<<25))
- rval |= FF_MM_MMXEXT
-#if !defined(__GNUC__) || __GNUC__ > 2
- | FF_MM_SSE;
- if (std_caps & (1<<26))
- rval |= FF_MM_SSE2;
- if (ecx & 1)
- rval |= FF_MM_SSE3;
- if (ecx & 0x00000200 )
- rval |= FF_MM_SSSE3
-#endif
- ;
- }
-
- cpuid(0x80000000, max_ext_level, ebx, ecx, edx);
-
- if(max_ext_level >= 0x80000001){
- cpuid(0x80000001, eax, ebx, ecx, ext_caps);
- if (ext_caps & (1<<31))
- rval |= FF_MM_3DNOW;
- if (ext_caps & (1<<30))
- rval |= FF_MM_3DNOWEXT;
- if (ext_caps & (1<<23))
- rval |= FF_MM_MMX;
- if (ext_caps & (1<<22))
- rval |= FF_MM_MMXEXT;
- }
-
-#if 0
- av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s\n",
- (rval&FF_MM_MMX) ? "MMX ":"",
- (rval&FF_MM_MMXEXT) ? "MMX2 ":"",
- (rval&FF_MM_SSE) ? "SSE ":"",
- (rval&FF_MM_SSE2) ? "SSE2 ":"",
- (rval&FF_MM_SSE3) ? "SSE3 ":"",
- (rval&FF_MM_SSSE3) ? "SSSE3 ":"",
- (rval&FF_MM_3DNOW) ? "3DNow ":"",
- (rval&FF_MM_3DNOWEXT) ? "3DNowExt ":"");
-#endif
- return rval;
-}
-
-#ifdef TEST
-int main ( void )
-{
- int mm_flags;
- mm_flags = mm_support();
- printf("mm_support = 0x%08X\n",mm_flags);
- return 0;
-}
-#endif
diff --git a/libavcodec/i386/dsputil_h264_template_mmx.c b/libavcodec/i386/dsputil_h264_template_mmx.c
deleted file mode 100644
index 0bf8732..0000000
--- a/libavcodec/i386/dsputil_h264_template_mmx.c
+++ /dev/null
@@ -1,308 +0,0 @@
-/*
- * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
- * Loren Merritt
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * MMX optimized version of (put|avg)_h264_chroma_mc8.
- * H264_CHROMA_MC8_TMPL must be defined to the desired function name
- * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
- * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
- */
-static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
-{
- const uint64_t *rnd_reg;
- DECLARE_ALIGNED_8(uint64_t, AA);
- DECLARE_ALIGNED_8(uint64_t, DD);
- int i;
-
- if(y==0 && x==0) {
- /* no filter needed */
- H264_CHROMA_MC8_MV0(dst, src, stride, h);
- return;
- }
-
- assert(x<8 && y<8 && x>=0 && y>=0);
-
- if(y==0 || x==0)
- {
- /* 1 dimensional filter only */
- const int dxy = x ? 1 : stride;
-
- rnd_reg = rnd ? &ff_pw_4 : &ff_pw_3;
-
- __asm__ volatile(
- "movd %0, %%mm5\n\t"
- "movq %1, %%mm4\n\t"
- "movq %2, %%mm6\n\t" /* mm6 = rnd */
- "punpcklwd %%mm5, %%mm5\n\t"
- "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
- "pxor %%mm7, %%mm7\n\t"
- "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */
- :: "rm"(x+y), "m"(ff_pw_8), "m"(*rnd_reg));
-
- for(i=0; i<h; i++) {
- __asm__ volatile(
- /* mm0 = src[0..7], mm1 = src[1..8] */
- "movq %0, %%mm0\n\t"
- "movq %1, %%mm2\n\t"
- :: "m"(src[0]), "m"(src[dxy]));
-
- __asm__ volatile(
- /* [mm0,mm1] = A * src[0..7] */
- /* [mm2,mm3] = B * src[1..8] */
- "movq %%mm0, %%mm1\n\t"
- "movq %%mm2, %%mm3\n\t"
- "punpcklbw %%mm7, %%mm0\n\t"
- "punpckhbw %%mm7, %%mm1\n\t"
- "punpcklbw %%mm7, %%mm2\n\t"
- "punpckhbw %%mm7, %%mm3\n\t"
- "pmullw %%mm4, %%mm0\n\t"
- "pmullw %%mm4, %%mm1\n\t"
- "pmullw %%mm5, %%mm2\n\t"
- "pmullw %%mm5, %%mm3\n\t"
-
- /* dst[0..7] = (A * src[0..7] + B * src[1..8] + 4) >> 3 */
- "paddw %%mm6, %%mm0\n\t"
- "paddw %%mm6, %%mm1\n\t"
- "paddw %%mm2, %%mm0\n\t"
- "paddw %%mm3, %%mm1\n\t"
- "psrlw $3, %%mm0\n\t"
- "psrlw $3, %%mm1\n\t"
- "packuswb %%mm1, %%mm0\n\t"
- H264_CHROMA_OP(%0, %%mm0)
- "movq %%mm0, %0\n\t"
- : "=m" (dst[0]));
-
- src += stride;
- dst += stride;
- }
- return;
- }
-
- /* general case, bilinear */
- rnd_reg = rnd ? &ff_pw_32.a : &ff_pw_28.a;
- __asm__ volatile("movd %2, %%mm4\n\t"
- "movd %3, %%mm6\n\t"
- "punpcklwd %%mm4, %%mm4\n\t"
- "punpcklwd %%mm6, %%mm6\n\t"
- "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
- "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
- "movq %%mm4, %%mm5\n\t"
- "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */
- "psllw $3, %%mm5\n\t"
- "psllw $3, %%mm6\n\t"
- "movq %%mm5, %%mm7\n\t"
- "paddw %%mm6, %%mm7\n\t"
- "movq %%mm4, %1\n\t" /* DD = x * y */
- "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */
- "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */
- "paddw %4, %%mm4\n\t"
- "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */
- "pxor %%mm7, %%mm7\n\t"
- "movq %%mm4, %0\n\t"
- : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
-
- __asm__ volatile(
- /* mm0 = src[0..7], mm1 = src[1..8] */
- "movq %0, %%mm0\n\t"
- "movq %1, %%mm1\n\t"
- : : "m" (src[0]), "m" (src[1]));
-
- for(i=0; i<h; i++) {
- src += stride;
-
- __asm__ volatile(
- /* mm2 = A * src[0..3] + B * src[1..4] */
- /* mm3 = A * src[4..7] + B * src[5..8] */
- "movq %%mm0, %%mm2\n\t"
- "movq %%mm1, %%mm3\n\t"
- "punpckhbw %%mm7, %%mm0\n\t"
- "punpcklbw %%mm7, %%mm1\n\t"
- "punpcklbw %%mm7, %%mm2\n\t"
- "punpckhbw %%mm7, %%mm3\n\t"
- "pmullw %0, %%mm0\n\t"
- "pmullw %0, %%mm2\n\t"
- "pmullw %%mm5, %%mm1\n\t"
- "pmullw %%mm5, %%mm3\n\t"
- "paddw %%mm1, %%mm2\n\t"
- "paddw %%mm0, %%mm3\n\t"
- : : "m" (AA));
-
- __asm__ volatile(
- /* [mm2,mm3] += C * src[0..7] */
- "movq %0, %%mm0\n\t"
- "movq %%mm0, %%mm1\n\t"
- "punpcklbw %%mm7, %%mm0\n\t"
- "punpckhbw %%mm7, %%mm1\n\t"
- "pmullw %%mm6, %%mm0\n\t"
- "pmullw %%mm6, %%mm1\n\t"
- "paddw %%mm0, %%mm2\n\t"
- "paddw %%mm1, %%mm3\n\t"
- : : "m" (src[0]));
-
- __asm__ volatile(
- /* [mm2,mm3] += D * src[1..8] */
- "movq %1, %%mm1\n\t"
- "movq %%mm1, %%mm0\n\t"
- "movq %%mm1, %%mm4\n\t"
- "punpcklbw %%mm7, %%mm0\n\t"
- "punpckhbw %%mm7, %%mm4\n\t"
- "pmullw %2, %%mm0\n\t"
- "pmullw %2, %%mm4\n\t"
- "paddw %%mm0, %%mm2\n\t"
- "paddw %%mm4, %%mm3\n\t"
- "movq %0, %%mm0\n\t"
- : : "m" (src[0]), "m" (src[1]), "m" (DD));
-
- __asm__ volatile(
- /* dst[0..7] = ([mm2,mm3] + 32) >> 6 */
- "paddw %1, %%mm2\n\t"
- "paddw %1, %%mm3\n\t"
- "psrlw $6, %%mm2\n\t"
- "psrlw $6, %%mm3\n\t"
- "packuswb %%mm3, %%mm2\n\t"
- H264_CHROMA_OP(%0, %%mm2)
- "movq %%mm2, %0\n\t"
- : "=m" (dst[0]) : "m" (*rnd_reg));
- dst+= stride;
- }
-}
-
-static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- __asm__ volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "movd %5, %%mm2 \n\t"
- "movd %6, %%mm3 \n\t"
- "movq "MANGLE(ff_pw_8)", %%mm4\n\t"
- "movq "MANGLE(ff_pw_8)", %%mm5\n\t"
- "punpcklwd %%mm2, %%mm2 \n\t"
- "punpcklwd %%mm3, %%mm3 \n\t"
- "punpcklwd %%mm2, %%mm2 \n\t"
- "punpcklwd %%mm3, %%mm3 \n\t"
- "psubw %%mm2, %%mm4 \n\t"
- "psubw %%mm3, %%mm5 \n\t"
-
- "movd (%1), %%mm0 \n\t"
- "movd 1(%1), %%mm6 \n\t"
- "add %3, %1 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm6 \n\t"
- "pmullw %%mm4, %%mm0 \n\t"
- "pmullw %%mm2, %%mm6 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
-
- "1: \n\t"
- "movd (%1), %%mm0 \n\t"
- "movd 1(%1), %%mm1 \n\t"
- "add %3, %1 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "pmullw %%mm4, %%mm0 \n\t"
- "pmullw %%mm2, %%mm1 \n\t"
- "paddw %%mm0, %%mm1 \n\t"
- "movq %%mm1, %%mm0 \n\t"
- "pmullw %%mm5, %%mm6 \n\t"
- "pmullw %%mm3, %%mm1 \n\t"
- "paddw %4, %%mm6 \n\t"
- "paddw %%mm6, %%mm1 \n\t"
- "psrlw $6, %%mm1 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- H264_CHROMA_OP4((%0), %%mm1, %%mm6)
- "movd %%mm1, (%0) \n\t"
- "add %3, %0 \n\t"
- "movd (%1), %%mm6 \n\t"
- "movd 1(%1), %%mm1 \n\t"
- "add %3, %1 \n\t"
- "punpcklbw %%mm7, %%mm6 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "pmullw %%mm4, %%mm6 \n\t"
- "pmullw %%mm2, %%mm1 \n\t"
- "paddw %%mm6, %%mm1 \n\t"
- "movq %%mm1, %%mm6 \n\t"
- "pmullw %%mm5, %%mm0 \n\t"
- "pmullw %%mm3, %%mm1 \n\t"
- "paddw %4, %%mm0 \n\t"
- "paddw %%mm0, %%mm1 \n\t"
- "psrlw $6, %%mm1 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- H264_CHROMA_OP4((%0), %%mm1, %%mm0)
- "movd %%mm1, (%0) \n\t"
- "add %3, %0 \n\t"
- "sub $2, %2 \n\t"
- "jnz 1b \n\t"
- : "+r"(dst), "+r"(src), "+r"(h)
- : "r"((x86_reg)stride), "m"(ff_pw_32), "m"(x), "m"(y)
- );
-}
-
-#ifdef H264_CHROMA_MC2_TMPL
-static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- int tmp = ((1<<16)-1)*x + 8;
- int CD= tmp*y;
- int AB= (tmp<<3) - CD;
- __asm__ volatile(
- /* mm5 = {A,B,A,B} */
- /* mm6 = {C,D,C,D} */
- "movd %0, %%mm5\n\t"
- "movd %1, %%mm6\n\t"
- "punpckldq %%mm5, %%mm5\n\t"
- "punpckldq %%mm6, %%mm6\n\t"
- "pxor %%mm7, %%mm7\n\t"
- /* mm0 = src[0,1,1,2] */
- "movd %2, %%mm2\n\t"
- "punpcklbw %%mm7, %%mm2\n\t"
- "pshufw $0x94, %%mm2, %%mm2\n\t"
- :: "r"(AB), "r"(CD), "m"(src[0]));
-
-
- __asm__ volatile(
- "1:\n\t"
- "add %4, %1\n\t"
- /* mm1 = A * src[0,1] + B * src[1,2] */
- "movq %%mm2, %%mm1\n\t"
- "pmaddwd %%mm5, %%mm1\n\t"
- /* mm0 = src[0,1,1,2] */
- "movd (%1), %%mm0\n\t"
- "punpcklbw %%mm7, %%mm0\n\t"
- "pshufw $0x94, %%mm0, %%mm0\n\t"
- /* mm1 += C * src[0,1] + D * src[1,2] */
- "movq %%mm0, %%mm2\n\t"
- "pmaddwd %%mm6, %%mm0\n\t"
- "paddw %3, %%mm1\n\t"
- "paddw %%mm0, %%mm1\n\t"
- /* dst[0,1] = pack((mm1 + 32) >> 6) */
- "psrlw $6, %%mm1\n\t"
- "packssdw %%mm7, %%mm1\n\t"
- "packuswb %%mm7, %%mm1\n\t"
- H264_CHROMA_OP4((%0), %%mm1, %%mm3)
- "movd %%mm1, %%esi\n\t"
- "movw %%si, (%0)\n\t"
- "add %4, %0\n\t"
- "sub $1, %2\n\t"
- "jnz 1b\n\t"
- : "+r" (dst), "+r"(src), "+r"(h)
- : "m" (ff_pw_32), "r"((x86_reg)stride)
- : "%esi");
-
-}
-#endif
-
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
deleted file mode 100644
index 161afab..0000000
--- a/libavcodec/i386/dsputil_mmx.c
+++ /dev/null
@@ -1,2916 +0,0 @@
-/*
- * MMX optimized DSP utils
- * Copyright (c) 2000, 2001 Fabrice Bellard.
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni at gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * MMX optimization by Nick Kurshev <nickols_k at mail.ru>
- */
-
-#include "libavutil/x86_cpu.h"
-#include "libavcodec/dsputil.h"
-#include "libavcodec/h263.h"
-#include "libavcodec/mpegvideo.h"
-#include "libavcodec/simple_idct.h"
-#include "dsputil_mmx.h"
-#include "mmx.h"
-#include "vp3dsp_mmx.h"
-#include "vp3dsp_sse2.h"
-#include "idct_xvid.h"
-
-//#undef NDEBUG
-//#include <assert.h>
-
-int mm_flags; /* multimedia extension flags */
-
-/* pixel operations */
-DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
-
-DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
-{0x8000000080000000ULL, 0x8000000080000000ULL};
-
-DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
-DECLARE_ALIGNED_16(const xmm_t, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
-DECLARE_ALIGNED_16(const xmm_t, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
-DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
-DECLARE_ALIGNED_16(const xmm_t, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
-DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
-DECLARE_ALIGNED_16(const xmm_t, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
-DECLARE_ALIGNED_16(const xmm_t, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
-DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
-
-DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
-DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
-
-DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
-DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
-
-#define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::)
-#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
-
-#define MOVQ_BFE(regd) \
- __asm__ volatile ( \
- "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
- "paddb %%" #regd ", %%" #regd " \n\t" ::)
-
-#ifndef PIC
-#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
-#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
-#else
-// for shared library it's better to use this way for accessing constants
-// pcmpeqd -> -1
-#define MOVQ_BONE(regd) \
- __asm__ volatile ( \
- "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
- "psrlw $15, %%" #regd " \n\t" \
- "packuswb %%" #regd ", %%" #regd " \n\t" ::)
-
-#define MOVQ_WTWO(regd) \
- __asm__ volatile ( \
- "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
- "psrlw $15, %%" #regd " \n\t" \
- "psllw $1, %%" #regd " \n\t"::)
-
-#endif
-
-// using regr as temporary and for the output result
-// first argument is unmodifed and second is trashed
-// regfe is supposed to contain 0xfefefefefefefefe
-#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
- "movq " #rega ", " #regr " \n\t"\
- "pand " #regb ", " #regr " \n\t"\
- "pxor " #rega ", " #regb " \n\t"\
- "pand " #regfe "," #regb " \n\t"\
- "psrlq $1, " #regb " \n\t"\
- "paddb " #regb ", " #regr " \n\t"
-
-#define PAVGB_MMX(rega, regb, regr, regfe) \
- "movq " #rega ", " #regr " \n\t"\
- "por " #regb ", " #regr " \n\t"\
- "pxor " #rega ", " #regb " \n\t"\
- "pand " #regfe "," #regb " \n\t"\
- "psrlq $1, " #regb " \n\t"\
- "psubb " #regb ", " #regr " \n\t"
-
-// mm6 is supposed to contain 0xfefefefefefefefe
-#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
- "movq " #rega ", " #regr " \n\t"\
- "movq " #regc ", " #regp " \n\t"\
- "pand " #regb ", " #regr " \n\t"\
- "pand " #regd ", " #regp " \n\t"\
- "pxor " #rega ", " #regb " \n\t"\
- "pxor " #regc ", " #regd " \n\t"\
- "pand %%mm6, " #regb " \n\t"\
- "pand %%mm6, " #regd " \n\t"\
- "psrlq $1, " #regb " \n\t"\
- "psrlq $1, " #regd " \n\t"\
- "paddb " #regb ", " #regr " \n\t"\
- "paddb " #regd ", " #regp " \n\t"
-
-#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
- "movq " #rega ", " #regr " \n\t"\
- "movq " #regc ", " #regp " \n\t"\
- "por " #regb ", " #regr " \n\t"\
- "por " #regd ", " #regp " \n\t"\
- "pxor " #rega ", " #regb " \n\t"\
- "pxor " #regc ", " #regd " \n\t"\
- "pand %%mm6, " #regb " \n\t"\
- "pand %%mm6, " #regd " \n\t"\
- "psrlq $1, " #regd " \n\t"\
- "psrlq $1, " #regb " \n\t"\
- "psubb " #regb ", " #regr " \n\t"\
- "psubb " #regd ", " #regp " \n\t"
-
-/***********************************/
-/* MMX no rounding */
-#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
-#define SET_RND MOVQ_WONE
-#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
-#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
-
-#include "dsputil_mmx_rnd_template.c"
-
-#undef DEF
-#undef SET_RND
-#undef PAVGBP
-#undef PAVGB
-/***********************************/
-/* MMX rounding */
-
-#define DEF(x, y) x ## _ ## y ##_mmx
-#define SET_RND MOVQ_WTWO
-#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
-#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
-
-#include "dsputil_mmx_rnd_template.c"
-
-#undef DEF
-#undef SET_RND
-#undef PAVGBP
-#undef PAVGB
-
-/***********************************/
-/* 3Dnow specific */
-
-#define DEF(x) x ## _3dnow
-#define PAVGB "pavgusb"
-
-#include "dsputil_mmx_avg_template.c"
-
-#undef DEF
-#undef PAVGB
-
-/***********************************/
-/* MMX2 specific */
-
-#define DEF(x) x ## _mmx2
-
-/* Introduced only in MMX2 set */
-#define PAVGB "pavgb"
-
-#include "dsputil_mmx_avg_template.c"
-
-#undef DEF
-#undef PAVGB
-
-#define put_no_rnd_pixels16_mmx put_pixels16_mmx
-#define put_no_rnd_pixels8_mmx put_pixels8_mmx
-#define put_pixels16_mmx2 put_pixels16_mmx
-#define put_pixels8_mmx2 put_pixels8_mmx
-#define put_pixels4_mmx2 put_pixels4_mmx
-#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
-#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
-#define put_pixels16_3dnow put_pixels16_mmx
-#define put_pixels8_3dnow put_pixels8_mmx
-#define put_pixels4_3dnow put_pixels4_mmx
-#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
-#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
-
-/***********************************/
-/* standard MMX */
-
-void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
-{
- const DCTELEM *p;
- uint8_t *pix;
-
- /* read the pixels */
- p = block;
- pix = pixels;
- /* unrolled loop */
- __asm__ volatile(
- "movq %3, %%mm0 \n\t"
- "movq 8%3, %%mm1 \n\t"
- "movq 16%3, %%mm2 \n\t"
- "movq 24%3, %%mm3 \n\t"
- "movq 32%3, %%mm4 \n\t"
- "movq 40%3, %%mm5 \n\t"
- "movq 48%3, %%mm6 \n\t"
- "movq 56%3, %%mm7 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "packuswb %%mm7, %%mm6 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm2, (%0, %1) \n\t"
- "movq %%mm4, (%0, %1, 2) \n\t"
- "movq %%mm6, (%0, %2) \n\t"
- ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
- :"memory");
- pix += line_size*4;
- p += 32;
-
- // if here would be an exact copy of the code above
- // compiler would generate some very strange code
- // thus using "r"
- __asm__ volatile(
- "movq (%3), %%mm0 \n\t"
- "movq 8(%3), %%mm1 \n\t"
- "movq 16(%3), %%mm2 \n\t"
- "movq 24(%3), %%mm3 \n\t"
- "movq 32(%3), %%mm4 \n\t"
- "movq 40(%3), %%mm5 \n\t"
- "movq 48(%3), %%mm6 \n\t"
- "movq 56(%3), %%mm7 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "packuswb %%mm7, %%mm6 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm2, (%0, %1) \n\t"
- "movq %%mm4, (%0, %1, 2) \n\t"
- "movq %%mm6, (%0, %2) \n\t"
- ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
- :"memory");
-}
-
-static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
- { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
-
-void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
-{
- int i;
-
- movq_m2r(*vector128, mm1);
- for (i = 0; i < 8; i++) {
- movq_m2r(*(block), mm0);
- packsswb_m2r(*(block + 4), mm0);
- block += 8;
- paddb_r2r(mm1, mm0);
- movq_r2m(mm0, *pixels);
- pixels += line_size;
- }
-}
-
-void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
-{
- const DCTELEM *p;
- uint8_t *pix;
- int i;
-
- /* read the pixels */
- p = block;
- pix = pixels;
- MOVQ_ZERO(mm7);
- i = 4;
- do {
- __asm__ volatile(
- "movq (%2), %%mm0 \n\t"
- "movq 8(%2), %%mm1 \n\t"
- "movq 16(%2), %%mm2 \n\t"
- "movq 24(%2), %%mm3 \n\t"
- "movq %0, %%mm4 \n\t"
- "movq %1, %%mm6 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddsw %%mm4, %%mm0 \n\t"
- "paddsw %%mm5, %%mm1 \n\t"
- "movq %%mm6, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm6 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddsw %%mm6, %%mm2 \n\t"
- "paddsw %%mm5, %%mm3 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "movq %%mm0, %0 \n\t"
- "movq %%mm2, %1 \n\t"
- :"+m"(*pix), "+m"(*(pix+line_size))
- :"r"(p)
- :"memory");
- pix += line_size*2;
- p += 16;
- } while (--i);
-}
-
-static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- ASMALIGN(3)
- "1: \n\t"
- "movd (%1), %%mm0 \n\t"
- "movd (%1, %3), %%mm1 \n\t"
- "movd %%mm0, (%2) \n\t"
- "movd %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movd (%1), %%mm0 \n\t"
- "movd (%1, %3), %%mm1 \n\t"
- "movd %%mm0, (%2) \n\t"
- "movd %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- : "+g"(h), "+r" (pixels), "+r" (block)
- : "r"((x86_reg)line_size)
- : "%"REG_a, "memory"
- );
-}
-
-static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- ASMALIGN(3)
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- : "+g"(h), "+r" (pixels), "+r" (block)
- : "r"((x86_reg)line_size)
- : "%"REG_a, "memory"
- );
-}
-
-static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- ASMALIGN(3)
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm4 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm5 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm4 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm5 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- : "+g"(h), "+r" (pixels), "+r" (block)
- : "r"((x86_reg)line_size)
- : "%"REG_a, "memory"
- );
-}
-
-static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- __asm__ volatile(
- "1: \n\t"
- "movdqu (%1), %%xmm0 \n\t"
- "movdqu (%1,%3), %%xmm1 \n\t"
- "movdqu (%1,%3,2), %%xmm2 \n\t"
- "movdqu (%1,%4), %%xmm3 \n\t"
- "movdqa %%xmm0, (%2) \n\t"
- "movdqa %%xmm1, (%2,%3) \n\t"
- "movdqa %%xmm2, (%2,%3,2) \n\t"
- "movdqa %%xmm3, (%2,%4) \n\t"
- "subl $4, %0 \n\t"
- "lea (%1,%3,4), %1 \n\t"
- "lea (%2,%3,4), %2 \n\t"
- "jnz 1b \n\t"
- : "+g"(h), "+r" (pixels), "+r" (block)
- : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
- : "memory"
- );
-}
-
-static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- __asm__ volatile(
- "1: \n\t"
- "movdqu (%1), %%xmm0 \n\t"
- "movdqu (%1,%3), %%xmm1 \n\t"
- "movdqu (%1,%3,2), %%xmm2 \n\t"
- "movdqu (%1,%4), %%xmm3 \n\t"
- "pavgb (%2), %%xmm0 \n\t"
- "pavgb (%2,%3), %%xmm1 \n\t"
- "pavgb (%2,%3,2), %%xmm2 \n\t"
- "pavgb (%2,%4), %%xmm3 \n\t"
- "movdqa %%xmm0, (%2) \n\t"
- "movdqa %%xmm1, (%2,%3) \n\t"
- "movdqa %%xmm2, (%2,%3,2) \n\t"
- "movdqa %%xmm3, (%2,%4) \n\t"
- "subl $4, %0 \n\t"
- "lea (%1,%3,4), %1 \n\t"
- "lea (%2,%3,4), %2 \n\t"
- "jnz 1b \n\t"
- : "+g"(h), "+r" (pixels), "+r" (block)
- : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
- : "memory"
- );
-}
-
-static void clear_blocks_mmx(DCTELEM *blocks)
-{
- __asm__ volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "mov $-128*6, %%"REG_a" \n\t"
- "1: \n\t"
- "movq %%mm7, (%0, %%"REG_a") \n\t"
- "movq %%mm7, 8(%0, %%"REG_a") \n\t"
- "movq %%mm7, 16(%0, %%"REG_a") \n\t"
- "movq %%mm7, 24(%0, %%"REG_a") \n\t"
- "add $32, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "r" (((uint8_t *)blocks)+128*6)
- : "%"REG_a
- );
-}
-
-static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
- x86_reg i=0;
- __asm__ volatile(
- "jmp 2f \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq (%2, %0), %%mm1 \n\t"
- "paddb %%mm0, %%mm1 \n\t"
- "movq %%mm1, (%2, %0) \n\t"
- "movq 8(%1, %0), %%mm0 \n\t"
- "movq 8(%2, %0), %%mm1 \n\t"
- "paddb %%mm0, %%mm1 \n\t"
- "movq %%mm1, 8(%2, %0) \n\t"
- "add $16, %0 \n\t"
- "2: \n\t"
- "cmp %3, %0 \n\t"
- " js 1b \n\t"
- : "+r" (i)
- : "r"(src), "r"(dst), "r"((x86_reg)w-15)
- );
- for(; i<w; i++)
- dst[i+0] += src[i+0];
-}
-
-static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
- x86_reg i=0;
- __asm__ volatile(
- "jmp 2f \n\t"
- "1: \n\t"
- "movq (%2, %0), %%mm0 \n\t"
- "movq 8(%2, %0), %%mm1 \n\t"
- "paddb (%3, %0), %%mm0 \n\t"
- "paddb 8(%3, %0), %%mm1 \n\t"
- "movq %%mm0, (%1, %0) \n\t"
- "movq %%mm1, 8(%1, %0) \n\t"
- "add $16, %0 \n\t"
- "2: \n\t"
- "cmp %4, %0 \n\t"
- " js 1b \n\t"
- : "+r" (i)
- : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
- );
- for(; i<w; i++)
- dst[i] = src1[i] + src2[i];
-}
-
-#define H263_LOOP_FILTER \
- "pxor %%mm7, %%mm7 \n\t"\
- "movq %0, %%mm0 \n\t"\
- "movq %0, %%mm1 \n\t"\
- "movq %3, %%mm2 \n\t"\
- "movq %3, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpckhbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm3 \n\t"\
- "psubw %%mm2, %%mm0 \n\t"\
- "psubw %%mm3, %%mm1 \n\t"\
- "movq %1, %%mm2 \n\t"\
- "movq %1, %%mm3 \n\t"\
- "movq %2, %%mm4 \n\t"\
- "movq %2, %%mm5 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- "punpckhbw %%mm7, %%mm5 \n\t"\
- "psubw %%mm2, %%mm4 \n\t"\
- "psubw %%mm3, %%mm5 \n\t"\
- "psllw $2, %%mm4 \n\t"\
- "psllw $2, %%mm5 \n\t"\
- "paddw %%mm0, %%mm4 \n\t"\
- "paddw %%mm1, %%mm5 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- "pcmpgtw %%mm4, %%mm6 \n\t"\
- "pcmpgtw %%mm5, %%mm7 \n\t"\
- "pxor %%mm6, %%mm4 \n\t"\
- "pxor %%mm7, %%mm5 \n\t"\
- "psubw %%mm6, %%mm4 \n\t"\
- "psubw %%mm7, %%mm5 \n\t"\
- "psrlw $3, %%mm4 \n\t"\
- "psrlw $3, %%mm5 \n\t"\
- "packuswb %%mm5, %%mm4 \n\t"\
- "packsswb %%mm7, %%mm6 \n\t"\
- "pxor %%mm7, %%mm7 \n\t"\
- "movd %4, %%mm2 \n\t"\
- "punpcklbw %%mm2, %%mm2 \n\t"\
- "punpcklbw %%mm2, %%mm2 \n\t"\
- "punpcklbw %%mm2, %%mm2 \n\t"\
- "psubusb %%mm4, %%mm2 \n\t"\
- "movq %%mm2, %%mm3 \n\t"\
- "psubusb %%mm4, %%mm3 \n\t"\
- "psubb %%mm3, %%mm2 \n\t"\
- "movq %1, %%mm3 \n\t"\
- "movq %2, %%mm4 \n\t"\
- "pxor %%mm6, %%mm3 \n\t"\
- "pxor %%mm6, %%mm4 \n\t"\
- "paddusb %%mm2, %%mm3 \n\t"\
- "psubusb %%mm2, %%mm4 \n\t"\
- "pxor %%mm6, %%mm3 \n\t"\
- "pxor %%mm6, %%mm4 \n\t"\
- "paddusb %%mm2, %%mm2 \n\t"\
- "packsswb %%mm1, %%mm0 \n\t"\
- "pcmpgtb %%mm0, %%mm7 \n\t"\
- "pxor %%mm7, %%mm0 \n\t"\
- "psubb %%mm7, %%mm0 \n\t"\
- "movq %%mm0, %%mm1 \n\t"\
- "psubusb %%mm2, %%mm0 \n\t"\
- "psubb %%mm0, %%mm1 \n\t"\
- "pand %5, %%mm1 \n\t"\
- "psrlw $2, %%mm1 \n\t"\
- "pxor %%mm7, %%mm1 \n\t"\
- "psubb %%mm7, %%mm1 \n\t"\
- "movq %0, %%mm5 \n\t"\
- "movq %3, %%mm6 \n\t"\
- "psubb %%mm1, %%mm5 \n\t"\
- "paddb %%mm1, %%mm6 \n\t"
-
-static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
- if(ENABLE_ANY_H263) {
- const int strength= ff_h263_loop_filter_strength[qscale];
-
- __asm__ volatile(
-
- H263_LOOP_FILTER
-
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %0 \n\t"
- "movq %%mm6, %3 \n\t"
- : "+m" (*(uint64_t*)(src - 2*stride)),
- "+m" (*(uint64_t*)(src - 1*stride)),
- "+m" (*(uint64_t*)(src + 0*stride)),
- "+m" (*(uint64_t*)(src + 1*stride))
- : "g" (2*strength), "m"(ff_pb_FC)
- );
- }
-}
-
-static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
- __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
- "movd %4, %%mm0 \n\t"
- "movd %5, %%mm1 \n\t"
- "movd %6, %%mm2 \n\t"
- "movd %7, %%mm3 \n\t"
- "punpcklbw %%mm1, %%mm0 \n\t"
- "punpcklbw %%mm3, %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "punpcklwd %%mm2, %%mm0 \n\t"
- "punpckhwd %%mm2, %%mm1 \n\t"
- "movd %%mm0, %0 \n\t"
- "punpckhdq %%mm0, %%mm0 \n\t"
- "movd %%mm0, %1 \n\t"
- "movd %%mm1, %2 \n\t"
- "punpckhdq %%mm1, %%mm1 \n\t"
- "movd %%mm1, %3 \n\t"
-
- : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
- "=m" (*(uint32_t*)(dst + 1*dst_stride)),
- "=m" (*(uint32_t*)(dst + 2*dst_stride)),
- "=m" (*(uint32_t*)(dst + 3*dst_stride))
- : "m" (*(uint32_t*)(src + 0*src_stride)),
- "m" (*(uint32_t*)(src + 1*src_stride)),
- "m" (*(uint32_t*)(src + 2*src_stride)),
- "m" (*(uint32_t*)(src + 3*src_stride))
- );
-}
-
-static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
- if(ENABLE_ANY_H263) {
- const int strength= ff_h263_loop_filter_strength[qscale];
- DECLARE_ALIGNED(8, uint64_t, temp[4]);
- uint8_t *btemp= (uint8_t*)temp;
-
- src -= 2;
-
- transpose4x4(btemp , src , 8, stride);
- transpose4x4(btemp+4, src + 4*stride, 8, stride);
- __asm__ volatile(
- H263_LOOP_FILTER // 5 3 4 6
-
- : "+m" (temp[0]),
- "+m" (temp[1]),
- "+m" (temp[2]),
- "+m" (temp[3])
- : "g" (2*strength), "m"(ff_pb_FC)
- );
-
- __asm__ volatile(
- "movq %%mm5, %%mm1 \n\t"
- "movq %%mm4, %%mm0 \n\t"
- "punpcklbw %%mm3, %%mm5 \n\t"
- "punpcklbw %%mm6, %%mm4 \n\t"
- "punpckhbw %%mm3, %%mm1 \n\t"
- "punpckhbw %%mm6, %%mm0 \n\t"
- "movq %%mm5, %%mm3 \n\t"
- "movq %%mm1, %%mm6 \n\t"
- "punpcklwd %%mm4, %%mm5 \n\t"
- "punpcklwd %%mm0, %%mm1 \n\t"
- "punpckhwd %%mm4, %%mm3 \n\t"
- "punpckhwd %%mm0, %%mm6 \n\t"
- "movd %%mm5, (%0) \n\t"
- "punpckhdq %%mm5, %%mm5 \n\t"
- "movd %%mm5, (%0,%2) \n\t"
- "movd %%mm3, (%0,%2,2) \n\t"
- "punpckhdq %%mm3, %%mm3 \n\t"
- "movd %%mm3, (%0,%3) \n\t"
- "movd %%mm1, (%1) \n\t"
- "punpckhdq %%mm1, %%mm1 \n\t"
- "movd %%mm1, (%1,%2) \n\t"
- "movd %%mm6, (%1,%2,2) \n\t"
- "punpckhdq %%mm6, %%mm6 \n\t"
- "movd %%mm6, (%1,%3) \n\t"
- :: "r" (src),
- "r" (src + 4*stride),
- "r" ((x86_reg) stride ),
- "r" ((x86_reg)(3*stride))
- );
- }
-}
-
-/* draw the edges of width 'w' of an image of size width, height
- this mmx version can only handle w==8 || w==16 */
-static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
-{
- uint8_t *ptr, *last_line;
- int i;
-
- last_line = buf + (height - 1) * wrap;
- /* left and right */
- ptr = buf;
- if(w==8)
- {
- __asm__ volatile(
- "1: \n\t"
- "movd (%0), %%mm0 \n\t"
- "punpcklbw %%mm0, %%mm0 \n\t"
- "punpcklwd %%mm0, %%mm0 \n\t"
- "punpckldq %%mm0, %%mm0 \n\t"
- "movq %%mm0, -8(%0) \n\t"
- "movq -8(%0, %2), %%mm1 \n\t"
- "punpckhbw %%mm1, %%mm1 \n\t"
- "punpckhwd %%mm1, %%mm1 \n\t"
- "punpckhdq %%mm1, %%mm1 \n\t"
- "movq %%mm1, (%0, %2) \n\t"
- "add %1, %0 \n\t"
- "cmp %3, %0 \n\t"
- " jb 1b \n\t"
- : "+r" (ptr)
- : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
- );
- }
- else
- {
- __asm__ volatile(
- "1: \n\t"
- "movd (%0), %%mm0 \n\t"
- "punpcklbw %%mm0, %%mm0 \n\t"
- "punpcklwd %%mm0, %%mm0 \n\t"
- "punpckldq %%mm0, %%mm0 \n\t"
- "movq %%mm0, -8(%0) \n\t"
- "movq %%mm0, -16(%0) \n\t"
- "movq -8(%0, %2), %%mm1 \n\t"
- "punpckhbw %%mm1, %%mm1 \n\t"
- "punpckhwd %%mm1, %%mm1 \n\t"
- "punpckhdq %%mm1, %%mm1 \n\t"
- "movq %%mm1, (%0, %2) \n\t"
- "movq %%mm1, 8(%0, %2) \n\t"
- "add %1, %0 \n\t"
- "cmp %3, %0 \n\t"
- " jb 1b \n\t"
- : "+r" (ptr)
- : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
- );
- }
-
- for(i=0;i<w;i+=4) {
- /* top and bottom (and hopefully also the corners) */
- ptr= buf - (i + 1) * wrap - w;
- __asm__ volatile(
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm0, (%0, %2) \n\t"
- "movq %%mm0, (%0, %2, 2) \n\t"
- "movq %%mm0, (%0, %3) \n\t"
- "add $8, %0 \n\t"
- "cmp %4, %0 \n\t"
- " jb 1b \n\t"
- : "+r" (ptr)
- : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
- );
- ptr= last_line + (i + 1) * wrap - w;
- __asm__ volatile(
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm0, (%0, %2) \n\t"
- "movq %%mm0, (%0, %2, 2) \n\t"
- "movq %%mm0, (%0, %3) \n\t"
- "add $8, %0 \n\t"
- "cmp %4, %0 \n\t"
- " jb 1b \n\t"
- : "+r" (ptr)
- : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
- );
- }
-}
-
-#define PAETH(cpu, abs3)\
-static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
-{\
- x86_reg i = -bpp;\
- x86_reg end = w-3;\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n"\
- "movd (%1,%0), %%mm0 \n"\
- "movd (%2,%0), %%mm1 \n"\
- "punpcklbw %%mm7, %%mm0 \n"\
- "punpcklbw %%mm7, %%mm1 \n"\
- "add %4, %0 \n"\
- "1: \n"\
- "movq %%mm1, %%mm2 \n"\
- "movd (%2,%0), %%mm1 \n"\
- "movq %%mm2, %%mm3 \n"\
- "punpcklbw %%mm7, %%mm1 \n"\
- "movq %%mm2, %%mm4 \n"\
- "psubw %%mm1, %%mm3 \n"\
- "psubw %%mm0, %%mm4 \n"\
- "movq %%mm3, %%mm5 \n"\
- "paddw %%mm4, %%mm5 \n"\
- abs3\
- "movq %%mm4, %%mm6 \n"\
- "pminsw %%mm5, %%mm6 \n"\
- "pcmpgtw %%mm6, %%mm3 \n"\
- "pcmpgtw %%mm5, %%mm4 \n"\
- "movq %%mm4, %%mm6 \n"\
- "pand %%mm3, %%mm4 \n"\
- "pandn %%mm3, %%mm6 \n"\
- "pandn %%mm0, %%mm3 \n"\
- "movd (%3,%0), %%mm0 \n"\
- "pand %%mm1, %%mm6 \n"\
- "pand %%mm4, %%mm2 \n"\
- "punpcklbw %%mm7, %%mm0 \n"\
- "movq %6, %%mm5 \n"\
- "paddw %%mm6, %%mm0 \n"\
- "paddw %%mm2, %%mm3 \n"\
- "paddw %%mm3, %%mm0 \n"\
- "pand %%mm5, %%mm0 \n"\
- "movq %%mm0, %%mm3 \n"\
- "packuswb %%mm3, %%mm3 \n"\
- "movd %%mm3, (%1,%0) \n"\
- "add %4, %0 \n"\
- "cmp %5, %0 \n"\
- "jle 1b \n"\
- :"+r"(i)\
- :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
- "m"(ff_pw_255)\
- :"memory"\
- );\
-}
-
-#define ABS3_MMX2\
- "psubw %%mm5, %%mm7 \n"\
- "pmaxsw %%mm7, %%mm5 \n"\
- "pxor %%mm6, %%mm6 \n"\
- "pxor %%mm7, %%mm7 \n"\
- "psubw %%mm3, %%mm6 \n"\
- "psubw %%mm4, %%mm7 \n"\
- "pmaxsw %%mm6, %%mm3 \n"\
- "pmaxsw %%mm7, %%mm4 \n"\
- "pxor %%mm7, %%mm7 \n"
-
-#define ABS3_SSSE3\
- "pabsw %%mm3, %%mm3 \n"\
- "pabsw %%mm4, %%mm4 \n"\
- "pabsw %%mm5, %%mm5 \n"
-
-PAETH(mmx2, ABS3_MMX2)
-#ifdef HAVE_SSSE3
-PAETH(ssse3, ABS3_SSSE3)
-#endif
-
-#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
- "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
- "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
- "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
- "movq "#in7", " #m3 " \n\t" /* d */\
- "movq "#in0", %%mm5 \n\t" /* D */\
- "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
- "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
- "movq "#in1", %%mm5 \n\t" /* C */\
- "movq "#in2", %%mm6 \n\t" /* B */\
- "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
- "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
- "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
- "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
- "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
- "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
- "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
- "psraw $5, %%mm5 \n\t"\
- "packuswb %%mm5, %%mm5 \n\t"\
- OP(%%mm5, out, %%mm7, d)
-
-#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
-static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- uint64_t temp;\
-\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "1: \n\t"\
- "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
- "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
- "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
- "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
- "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
- "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
- "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
- "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
- "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
- "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
- "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
- "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
- "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
- "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
- "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
- "paddw %%mm3, %%mm5 \n\t" /* b */\
- "paddw %%mm2, %%mm6 \n\t" /* c */\
- "paddw %%mm5, %%mm5 \n\t" /* 2b */\
- "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
- "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
- "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
- "paddw %%mm4, %%mm0 \n\t" /* a */\
- "paddw %%mm1, %%mm5 \n\t" /* d */\
- "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
- "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
- "paddw %6, %%mm6 \n\t"\
- "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
- "psraw $5, %%mm0 \n\t"\
- "movq %%mm0, %5 \n\t"\
- /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
- \
- "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
- "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
- "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
- "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
- "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
- "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
- "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
- "paddw %%mm0, %%mm2 \n\t" /* b */\
- "paddw %%mm5, %%mm3 \n\t" /* c */\
- "paddw %%mm2, %%mm2 \n\t" /* 2b */\
- "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
- "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
- "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
- "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
- "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
- "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
- "paddw %%mm2, %%mm1 \n\t" /* a */\
- "paddw %%mm6, %%mm4 \n\t" /* d */\
- "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
- "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
- "paddw %6, %%mm1 \n\t"\
- "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
- "psraw $5, %%mm3 \n\t"\
- "movq %5, %%mm1 \n\t"\
- "packuswb %%mm3, %%mm1 \n\t"\
- OP_MMX2(%%mm1, (%1),%%mm4, q)\
- /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
- \
- "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
- "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
- "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
- "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
- "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
- "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
- "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
- "paddw %%mm1, %%mm5 \n\t" /* b */\
- "paddw %%mm4, %%mm0 \n\t" /* c */\
- "paddw %%mm5, %%mm5 \n\t" /* 2b */\
- "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
- "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
- "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
- "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
- "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
- "paddw %%mm3, %%mm2 \n\t" /* d */\
- "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
- "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
- "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
- "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
- "paddw %%mm2, %%mm6 \n\t" /* a */\
- "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
- "paddw %6, %%mm0 \n\t"\
- "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
- "psraw $5, %%mm0 \n\t"\
- /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
- \
- "paddw %%mm5, %%mm3 \n\t" /* a */\
- "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
- "paddw %%mm4, %%mm6 \n\t" /* b */\
- "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
- "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
- "paddw %%mm1, %%mm4 \n\t" /* c */\
- "paddw %%mm2, %%mm5 \n\t" /* d */\
- "paddw %%mm6, %%mm6 \n\t" /* 2b */\
- "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
- "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
- "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
- "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
- "paddw %6, %%mm4 \n\t"\
- "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
- "psraw $5, %%mm4 \n\t"\
- "packuswb %%mm4, %%mm0 \n\t"\
- OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
- \
- "add %3, %0 \n\t"\
- "add %4, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(src), "+c"(dst), "+D"(h)\
- : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
- : "memory"\
- );\
-}\
-\
-static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- int i;\
- int16_t temp[16];\
- /* quick HACK, XXX FIXME MUST be optimized */\
- for(i=0; i<h; i++)\
- {\
- temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
- temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
- temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
- temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
- temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
- temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
- temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
- temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
- temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
- temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
- temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
- temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
- temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
- temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
- temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
- temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
- __asm__ volatile(\
- "movq (%0), %%mm0 \n\t"\
- "movq 8(%0), %%mm1 \n\t"\
- "paddw %2, %%mm0 \n\t"\
- "paddw %2, %%mm1 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "psraw $5, %%mm1 \n\t"\
- "packuswb %%mm1, %%mm0 \n\t"\
- OP_3DNOW(%%mm0, (%1), %%mm1, q)\
- "movq 16(%0), %%mm0 \n\t"\
- "movq 24(%0), %%mm1 \n\t"\
- "paddw %2, %%mm0 \n\t"\
- "paddw %2, %%mm1 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "psraw $5, %%mm1 \n\t"\
- "packuswb %%mm1, %%mm0 \n\t"\
- OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
- :: "r"(temp), "r"(dst), "m"(ROUNDER)\
- : "memory"\
- );\
- dst+=dstStride;\
- src+=srcStride;\
- }\
-}\
-\
-static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "1: \n\t"\
- "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
- "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
- "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
- "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
- "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
- "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
- "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
- "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
- "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
- "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
- "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
- "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
- "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
- "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
- "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
- "paddw %%mm3, %%mm5 \n\t" /* b */\
- "paddw %%mm2, %%mm6 \n\t" /* c */\
- "paddw %%mm5, %%mm5 \n\t" /* 2b */\
- "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
- "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
- "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
- "paddw %%mm4, %%mm0 \n\t" /* a */\
- "paddw %%mm1, %%mm5 \n\t" /* d */\
- "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
- "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
- "paddw %5, %%mm6 \n\t"\
- "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
- "psraw $5, %%mm0 \n\t"\
- /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
- \
- "movd 5(%0), %%mm5 \n\t" /* FGHI */\
- "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
- "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
- "paddw %%mm5, %%mm1 \n\t" /* a */\
- "paddw %%mm6, %%mm2 \n\t" /* b */\
- "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
- "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
- "paddw %%mm6, %%mm3 \n\t" /* c */\
- "paddw %%mm5, %%mm4 \n\t" /* d */\
- "paddw %%mm2, %%mm2 \n\t" /* 2b */\
- "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
- "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
- "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
- "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
- "paddw %5, %%mm1 \n\t"\
- "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
- "psraw $5, %%mm3 \n\t"\
- "packuswb %%mm3, %%mm0 \n\t"\
- OP_MMX2(%%mm0, (%1), %%mm4, q)\
- \
- "add %3, %0 \n\t"\
- "add %4, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(src), "+c"(dst), "+d"(h)\
- : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
- : "memory"\
- );\
-}\
-\
-static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- int i;\
- int16_t temp[8];\
- /* quick HACK, XXX FIXME MUST be optimized */\
- for(i=0; i<h; i++)\
- {\
- temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
- temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
- temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
- temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
- temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
- temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
- temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
- temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
- __asm__ volatile(\
- "movq (%0), %%mm0 \n\t"\
- "movq 8(%0), %%mm1 \n\t"\
- "paddw %2, %%mm0 \n\t"\
- "paddw %2, %%mm1 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "psraw $5, %%mm1 \n\t"\
- "packuswb %%mm1, %%mm0 \n\t"\
- OP_3DNOW(%%mm0, (%1), %%mm1, q)\
- :: "r"(temp), "r"(dst), "m"(ROUNDER)\
- :"memory"\
- );\
- dst+=dstStride;\
- src+=srcStride;\
- }\
-}
-
-#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
-\
-static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- uint64_t temp[17*4];\
- uint64_t *temp_ptr= temp;\
- int count= 17;\
-\
- /*FIXME unroll */\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "1: \n\t"\
- "movq (%0), %%mm0 \n\t"\
- "movq (%0), %%mm1 \n\t"\
- "movq 8(%0), %%mm2 \n\t"\
- "movq 8(%0), %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpckhbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm3 \n\t"\
- "movq %%mm0, (%1) \n\t"\
- "movq %%mm1, 17*8(%1) \n\t"\
- "movq %%mm2, 2*17*8(%1) \n\t"\
- "movq %%mm3, 3*17*8(%1) \n\t"\
- "add $8, %1 \n\t"\
- "add %3, %0 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+r" (src), "+r" (temp_ptr), "+r"(count)\
- : "r" ((x86_reg)srcStride)\
- : "memory"\
- );\
- \
- temp_ptr= temp;\
- count=4;\
- \
-/*FIXME reorder for speed */\
- __asm__ volatile(\
- /*"pxor %%mm7, %%mm7 \n\t"*/\
- "1: \n\t"\
- "movq (%0), %%mm0 \n\t"\
- "movq 8(%0), %%mm1 \n\t"\
- "movq 16(%0), %%mm2 \n\t"\
- "movq 24(%0), %%mm3 \n\t"\
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
- "add %4, %1 \n\t"\
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
- \
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
- "add %4, %1 \n\t"\
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
- "add %4, %1 \n\t"\
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
- "add %4, %1 \n\t"\
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
- "add %4, %1 \n\t"\
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
- "add %4, %1 \n\t"\
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
- \
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
- "add %4, %1 \n\t" \
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
- \
- "add $136, %0 \n\t"\
- "add %6, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- \
- : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
- : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
- :"memory"\
- );\
-}\
-\
-static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- uint64_t temp[9*2];\
- uint64_t *temp_ptr= temp;\
- int count= 9;\
-\
- /*FIXME unroll */\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "1: \n\t"\
- "movq (%0), %%mm0 \n\t"\
- "movq (%0), %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpckhbw %%mm7, %%mm1 \n\t"\
- "movq %%mm0, (%1) \n\t"\
- "movq %%mm1, 9*8(%1) \n\t"\
- "add $8, %1 \n\t"\
- "add %3, %0 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+r" (src), "+r" (temp_ptr), "+r"(count)\
- : "r" ((x86_reg)srcStride)\
- : "memory"\
- );\
- \
- temp_ptr= temp;\
- count=2;\
- \
-/*FIXME reorder for speed */\
- __asm__ volatile(\
- /*"pxor %%mm7, %%mm7 \n\t"*/\
- "1: \n\t"\
- "movq (%0), %%mm0 \n\t"\
- "movq 8(%0), %%mm1 \n\t"\
- "movq 16(%0), %%mm2 \n\t"\
- "movq 24(%0), %%mm3 \n\t"\
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
- "add %4, %1 \n\t"\
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
- \
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
- "add %4, %1 \n\t"\
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
- \
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
- "add %4, %1 \n\t"\
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
- \
- "add $72, %0 \n\t"\
- "add %6, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- \
- : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
- : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
- : "memory"\
- );\
-}\
-\
-static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
-}\
-\
-static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t temp[8];\
- uint8_t * const half= (uint8_t*)temp;\
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
- OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
-}\
-\
-static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
-}\
-\
-static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t temp[8];\
- uint8_t * const half= (uint8_t*)temp;\
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
- OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
-}\
-\
-static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t temp[8];\
- uint8_t * const half= (uint8_t*)temp;\
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
- OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
-}\
-\
-static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t temp[8];\
- uint8_t * const half= (uint8_t*)temp;\
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
- OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
-}\
-static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[8 + 9];\
- uint8_t * const halfH= ((uint8_t*)half) + 64;\
- uint8_t * const halfHV= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
- put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[8 + 9];\
- uint8_t * const halfH= ((uint8_t*)half) + 64;\
- uint8_t * const halfHV= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
- put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[8 + 9];\
- uint8_t * const halfH= ((uint8_t*)half) + 64;\
- uint8_t * const halfHV= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
- put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[8 + 9];\
- uint8_t * const halfH= ((uint8_t*)half) + 64;\
- uint8_t * const halfHV= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
- put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[8 + 9];\
- uint8_t * const halfH= ((uint8_t*)half) + 64;\
- uint8_t * const halfHV= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[8 + 9];\
- uint8_t * const halfH= ((uint8_t*)half) + 64;\
- uint8_t * const halfHV= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[8 + 9];\
- uint8_t * const halfH= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
- put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
- OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
-}\
-static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[8 + 9];\
- uint8_t * const halfH= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
- put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
- OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
-}\
-static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[9];\
- uint8_t * const halfH= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
- OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
-}\
-static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
-}\
-\
-static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t temp[32];\
- uint8_t * const half= (uint8_t*)temp;\
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
- OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
-}\
-\
-static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
-}\
-\
-static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t temp[32];\
- uint8_t * const half= (uint8_t*)temp;\
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
- OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
-}\
-\
-static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t temp[32];\
- uint8_t * const half= (uint8_t*)temp;\
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
- OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
-}\
-\
-static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t temp[32];\
- uint8_t * const half= (uint8_t*)temp;\
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
- OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
-}\
-static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[16*2 + 17*2];\
- uint8_t * const halfH= ((uint8_t*)half) + 256;\
- uint8_t * const halfHV= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
- put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[16*2 + 17*2];\
- uint8_t * const halfH= ((uint8_t*)half) + 256;\
- uint8_t * const halfHV= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
- put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[16*2 + 17*2];\
- uint8_t * const halfH= ((uint8_t*)half) + 256;\
- uint8_t * const halfHV= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
- put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[16*2 + 17*2];\
- uint8_t * const halfH= ((uint8_t*)half) + 256;\
- uint8_t * const halfHV= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
- put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[16*2 + 17*2];\
- uint8_t * const halfH= ((uint8_t*)half) + 256;\
- uint8_t * const halfHV= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[16*2 + 17*2];\
- uint8_t * const halfH= ((uint8_t*)half) + 256;\
- uint8_t * const halfHV= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[17*2];\
- uint8_t * const halfH= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
- put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
- OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
-}\
-static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[17*2];\
- uint8_t * const halfH= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
- put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
- OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
-}\
-static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t half[17*2];\
- uint8_t * const halfH= ((uint8_t*)half);\
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
- OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
-}
-
-#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
-#define AVG_3DNOW_OP(a,b,temp, size) \
-"mov" #size " " #b ", " #temp " \n\t"\
-"pavgusb " #temp ", " #a " \n\t"\
-"mov" #size " " #a ", " #b " \n\t"
-#define AVG_MMX2_OP(a,b,temp, size) \
-"mov" #size " " #b ", " #temp " \n\t"\
-"pavgb " #temp ", " #a " \n\t"\
-"mov" #size " " #a ", " #b " \n\t"
-
-QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
-QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
-QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
-QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
-QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
-QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
-QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
-QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
-QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
-
-/***********************************/
-/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
-
-#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
-}
-#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
-}
-
-#define QPEL_2TAP(OPNAME, SIZE, MMX)\
-QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
-QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
-QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
-static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
- OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
-static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
- OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
-static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
- OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
-}\
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
-}\
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
-
-QPEL_2TAP(put_, 16, mmx2)
-QPEL_2TAP(avg_, 16, mmx2)
-QPEL_2TAP(put_, 8, mmx2)
-QPEL_2TAP(avg_, 8, mmx2)
-QPEL_2TAP(put_, 16, 3dnow)
-QPEL_2TAP(avg_, 16, 3dnow)
-QPEL_2TAP(put_, 8, 3dnow)
-QPEL_2TAP(avg_, 8, 3dnow)
-
-
-#if 0
-static void just_return() { return; }
-#endif
-
-static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
- int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
- const int w = 8;
- const int ix = ox>>(16+shift);
- const int iy = oy>>(16+shift);
- const int oxs = ox>>4;
- const int oys = oy>>4;
- const int dxxs = dxx>>4;
- const int dxys = dxy>>4;
- const int dyxs = dyx>>4;
- const int dyys = dyy>>4;
- const uint16_t r4[4] = {r,r,r,r};
- const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
- const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
- const uint64_t shift2 = 2*shift;
- uint8_t edge_buf[(h+1)*stride];
- int x, y;
-
- const int dxw = (dxx-(1<<(16+shift)))*(w-1);
- const int dyh = (dyy-(1<<(16+shift)))*(h-1);
- const int dxh = dxy*(h-1);
- const int dyw = dyx*(w-1);
- if( // non-constant fullpel offset (3% of blocks)
- ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
- (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
- // uses more than 16 bits of subpel mv (only at huge resolution)
- || (dxx|dxy|dyx|dyy)&15 )
- {
- //FIXME could still use mmx for some of the rows
- ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
- return;
- }
-
- src += ix + iy*stride;
- if( (unsigned)ix >= width-w ||
- (unsigned)iy >= height-h )
- {
- ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
- src = edge_buf;
- }
-
- __asm__ volatile(
- "movd %0, %%mm6 \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- "punpcklwd %%mm6, %%mm6 \n\t"
- "punpcklwd %%mm6, %%mm6 \n\t"
- :: "r"(1<<shift)
- );
-
- for(x=0; x<w; x+=4){
- uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
- oxs - dxys + dxxs*(x+1),
- oxs - dxys + dxxs*(x+2),
- oxs - dxys + dxxs*(x+3) };
- uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
- oys - dyys + dyxs*(x+1),
- oys - dyys + dyxs*(x+2),
- oys - dyys + dyxs*(x+3) };
-
- for(y=0; y<h; y++){
- __asm__ volatile(
- "movq %0, %%mm4 \n\t"
- "movq %1, %%mm5 \n\t"
- "paddw %2, %%mm4 \n\t"
- "paddw %3, %%mm5 \n\t"
- "movq %%mm4, %0 \n\t"
- "movq %%mm5, %1 \n\t"
- "psrlw $12, %%mm4 \n\t"
- "psrlw $12, %%mm5 \n\t"
- : "+m"(*dx4), "+m"(*dy4)
- : "m"(*dxy4), "m"(*dyy4)
- );
-
- __asm__ volatile(
- "movq %%mm6, %%mm2 \n\t"
- "movq %%mm6, %%mm1 \n\t"
- "psubw %%mm4, %%mm2 \n\t"
- "psubw %%mm5, %%mm1 \n\t"
- "movq %%mm2, %%mm0 \n\t"
- "movq %%mm4, %%mm3 \n\t"
- "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
- "pmullw %%mm5, %%mm3 \n\t" // dx*dy
- "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
- "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
-
- "movd %4, %%mm5 \n\t"
- "movd %3, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
- "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
-
- "movd %2, %%mm5 \n\t"
- "movd %1, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
- "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
- "paddw %5, %%mm1 \n\t"
- "paddw %%mm3, %%mm2 \n\t"
- "paddw %%mm1, %%mm0 \n\t"
- "paddw %%mm2, %%mm0 \n\t"
-
- "psrlw %6, %%mm0 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "movd %%mm0, %0 \n\t"
-
- : "=m"(dst[x+y*stride])
- : "m"(src[0]), "m"(src[1]),
- "m"(src[stride]), "m"(src[stride+1]),
- "m"(*r4), "m"(shift2)
- );
- src += stride;
- }
- src += 4-h*stride;
- }
-}
-
-#define PREFETCH(name, op) \
-static void name(void *mem, int stride, int h){\
- const uint8_t *p= mem;\
- do{\
- __asm__ volatile(#op" %0" :: "m"(*p));\
- p+= stride;\
- }while(--h);\
-}
-PREFETCH(prefetch_mmx2, prefetcht0)
-PREFETCH(prefetch_3dnow, prefetch)
-#undef PREFETCH
-
-#include "h264dsp_mmx.c"
-
-/* CAVS specific */
-void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
-void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx);
-
-void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
- put_pixels8_mmx(dst, src, stride, 8);
-}
-void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
- avg_pixels8_mmx(dst, src, stride, 8);
-}
-void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
- put_pixels16_mmx(dst, src, stride, 16);
-}
-void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
- avg_pixels16_mmx(dst, src, stride, 16);
-}
-
-/* VC1 specific */
-void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
-
-void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
- put_pixels8_mmx(dst, src, stride, 8);
-}
-
-/* external functions, from idct_mmx.c */
-void ff_mmx_idct(DCTELEM *block);
-void ff_mmxext_idct(DCTELEM *block);
-
-/* XXX: those functions should be suppressed ASAP when all IDCTs are
- converted */
-#ifdef CONFIG_GPL
-static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
-{
- ff_mmx_idct (block);
- put_pixels_clamped_mmx(block, dest, line_size);
-}
-static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
-{
- ff_mmx_idct (block);
- add_pixels_clamped_mmx(block, dest, line_size);
-}
-static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
-{
- ff_mmxext_idct (block);
- put_pixels_clamped_mmx(block, dest, line_size);
-}
-static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
-{
- ff_mmxext_idct (block);
- add_pixels_clamped_mmx(block, dest, line_size);
-}
-#endif
-static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
-{
- ff_idct_xvid_mmx (block);
- put_pixels_clamped_mmx(block, dest, line_size);
-}
-static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
-{
- ff_idct_xvid_mmx (block);
- add_pixels_clamped_mmx(block, dest, line_size);
-}
-static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
-{
- ff_idct_xvid_mmx2 (block);
- put_pixels_clamped_mmx(block, dest, line_size);
-}
-static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
-{
- ff_idct_xvid_mmx2 (block);
- add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
-{
- int i;
- __asm__ volatile("pxor %%mm7, %%mm7":);
- for(i=0; i<blocksize; i+=2) {
- __asm__ volatile(
- "movq %0, %%mm0 \n\t"
- "movq %1, %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm1, %%mm3 \n\t"
- "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
- "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
- "pslld $31, %%mm2 \n\t" // keep only the sign bit
- "pxor %%mm2, %%mm1 \n\t"
- "movq %%mm3, %%mm4 \n\t"
- "pand %%mm1, %%mm3 \n\t"
- "pandn %%mm1, %%mm4 \n\t"
- "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
- "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
- "movq %%mm3, %1 \n\t"
- "movq %%mm0, %0 \n\t"
- :"+m"(mag[i]), "+m"(ang[i])
- ::"memory"
- );
- }
- __asm__ volatile("femms");
-}
-static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
-{
- int i;
-
- __asm__ volatile(
- "movaps %0, %%xmm5 \n\t"
- ::"m"(ff_pdw_80000000[0])
- );
- for(i=0; i<blocksize; i+=4) {
- __asm__ volatile(
- "movaps %0, %%xmm0 \n\t"
- "movaps %1, %%xmm1 \n\t"
- "xorps %%xmm2, %%xmm2 \n\t"
- "xorps %%xmm3, %%xmm3 \n\t"
- "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
- "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
- "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
- "xorps %%xmm2, %%xmm1 \n\t"
- "movaps %%xmm3, %%xmm4 \n\t"
- "andps %%xmm1, %%xmm3 \n\t"
- "andnps %%xmm1, %%xmm4 \n\t"
- "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
- "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
- "movaps %%xmm3, %1 \n\t"
- "movaps %%xmm0, %0 \n\t"
- :"+m"(mag[i]), "+m"(ang[i])
- ::"memory"
- );
- }
-}
-
-#define IF1(x) x
-#define IF0(x)
-
-#define MIX5(mono,stereo)\
- __asm__ volatile(\
- "movss 0(%2), %%xmm5 \n"\
- "movss 8(%2), %%xmm6 \n"\
- "movss 24(%2), %%xmm7 \n"\
- "shufps $0, %%xmm5, %%xmm5 \n"\
- "shufps $0, %%xmm6, %%xmm6 \n"\
- "shufps $0, %%xmm7, %%xmm7 \n"\
- "1: \n"\
- "movaps (%0,%1), %%xmm0 \n"\
- "movaps 0x400(%0,%1), %%xmm1 \n"\
- "movaps 0x800(%0,%1), %%xmm2 \n"\
- "movaps 0xc00(%0,%1), %%xmm3 \n"\
- "movaps 0x1000(%0,%1), %%xmm4 \n"\
- "mulps %%xmm5, %%xmm0 \n"\
- "mulps %%xmm6, %%xmm1 \n"\
- "mulps %%xmm5, %%xmm2 \n"\
- "mulps %%xmm7, %%xmm3 \n"\
- "mulps %%xmm7, %%xmm4 \n"\
- stereo("addps %%xmm1, %%xmm0 \n")\
- "addps %%xmm1, %%xmm2 \n"\
- "addps %%xmm3, %%xmm0 \n"\
- "addps %%xmm4, %%xmm2 \n"\
- mono("addps %%xmm2, %%xmm0 \n")\
- "movaps %%xmm0, (%0,%1) \n"\
- stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
- "add $16, %0 \n"\
- "jl 1b \n"\
- :"+&r"(i)\
- :"r"(samples[0]+len), "r"(matrix)\
- :"memory"\
- );
-
-#define MIX_MISC(stereo)\
- __asm__ volatile(\
- "1: \n"\
- "movaps (%3,%0), %%xmm0 \n"\
- stereo("movaps %%xmm0, %%xmm1 \n")\
- "mulps %%xmm6, %%xmm0 \n"\
- stereo("mulps %%xmm7, %%xmm1 \n")\
- "lea 1024(%3,%0), %1 \n"\
- "mov %5, %2 \n"\
- "2: \n"\
- "movaps (%1), %%xmm2 \n"\
- stereo("movaps %%xmm2, %%xmm3 \n")\
- "mulps (%4,%2), %%xmm2 \n"\
- stereo("mulps 16(%4,%2), %%xmm3 \n")\
- "addps %%xmm2, %%xmm0 \n"\
- stereo("addps %%xmm3, %%xmm1 \n")\
- "add $1024, %1 \n"\
- "add $32, %2 \n"\
- "jl 2b \n"\
- "movaps %%xmm0, (%3,%0) \n"\
- stereo("movaps %%xmm1, 1024(%3,%0) \n")\
- "add $16, %0 \n"\
- "jl 1b \n"\
- :"+&r"(i), "=&r"(j), "=&r"(k)\
- :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
- :"memory"\
- );
-
-static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
-{
- int (*matrix_cmp)[2] = (int(*)[2])matrix;
- intptr_t i,j,k;
-
- i = -len*sizeof(float);
- if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
- MIX5(IF0,IF1);
- } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
- MIX5(IF1,IF0);
- } else {
- DECLARE_ALIGNED_16(float, matrix_simd[in_ch][2][4]);
- j = 2*in_ch*sizeof(float);
- __asm__ volatile(
- "1: \n"
- "sub $8, %0 \n"
- "movss (%2,%0), %%xmm6 \n"
- "movss 4(%2,%0), %%xmm7 \n"
- "shufps $0, %%xmm6, %%xmm6 \n"
- "shufps $0, %%xmm7, %%xmm7 \n"
- "movaps %%xmm6, (%1,%0,4) \n"
- "movaps %%xmm7, 16(%1,%0,4) \n"
- "jg 1b \n"
- :"+&r"(j)
- :"r"(matrix_simd), "r"(matrix)
- :"memory"
- );
- if(out_ch == 2) {
- MIX_MISC(IF1);
- } else {
- MIX_MISC(IF0);
- }
- }
-}
-
-static void vector_fmul_3dnow(float *dst, const float *src, int len){
- x86_reg i = (len-4)*4;
- __asm__ volatile(
- "1: \n\t"
- "movq (%1,%0), %%mm0 \n\t"
- "movq 8(%1,%0), %%mm1 \n\t"
- "pfmul (%2,%0), %%mm0 \n\t"
- "pfmul 8(%2,%0), %%mm1 \n\t"
- "movq %%mm0, (%1,%0) \n\t"
- "movq %%mm1, 8(%1,%0) \n\t"
- "sub $16, %0 \n\t"
- "jge 1b \n\t"
- "femms \n\t"
- :"+r"(i)
- :"r"(dst), "r"(src)
- :"memory"
- );
-}
-static void vector_fmul_sse(float *dst, const float *src, int len){
- x86_reg i = (len-8)*4;
- __asm__ volatile(
- "1: \n\t"
- "movaps (%1,%0), %%xmm0 \n\t"
- "movaps 16(%1,%0), %%xmm1 \n\t"
- "mulps (%2,%0), %%xmm0 \n\t"
- "mulps 16(%2,%0), %%xmm1 \n\t"
- "movaps %%xmm0, (%1,%0) \n\t"
- "movaps %%xmm1, 16(%1,%0) \n\t"
- "sub $32, %0 \n\t"
- "jge 1b \n\t"
- :"+r"(i)
- :"r"(dst), "r"(src)
- :"memory"
- );
-}
-
-static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
- x86_reg i = len*4-16;
- __asm__ volatile(
- "1: \n\t"
- "pswapd 8(%1), %%mm0 \n\t"
- "pswapd (%1), %%mm1 \n\t"
- "pfmul (%3,%0), %%mm0 \n\t"
- "pfmul 8(%3,%0), %%mm1 \n\t"
- "movq %%mm0, (%2,%0) \n\t"
- "movq %%mm1, 8(%2,%0) \n\t"
- "add $16, %1 \n\t"
- "sub $16, %0 \n\t"
- "jge 1b \n\t"
- :"+r"(i), "+r"(src1)
- :"r"(dst), "r"(src0)
- );
- __asm__ volatile("femms");
-}
-static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
- x86_reg i = len*4-32;
- __asm__ volatile(
- "1: \n\t"
- "movaps 16(%1), %%xmm0 \n\t"
- "movaps (%1), %%xmm1 \n\t"
- "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
- "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
- "mulps (%3,%0), %%xmm0 \n\t"
- "mulps 16(%3,%0), %%xmm1 \n\t"
- "movaps %%xmm0, (%2,%0) \n\t"
- "movaps %%xmm1, 16(%2,%0) \n\t"
- "add $32, %1 \n\t"
- "sub $32, %0 \n\t"
- "jge 1b \n\t"
- :"+r"(i), "+r"(src1)
- :"r"(dst), "r"(src0)
- );
-}
-
-static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
- const float *src2, int src3, int len, int step){
- x86_reg i = (len-4)*4;
- if(step == 2 && src3 == 0){
- dst += (len-4)*2;
- __asm__ volatile(
- "1: \n\t"
- "movq (%2,%0), %%mm0 \n\t"
- "movq 8(%2,%0), %%mm1 \n\t"
- "pfmul (%3,%0), %%mm0 \n\t"
- "pfmul 8(%3,%0), %%mm1 \n\t"
- "pfadd (%4,%0), %%mm0 \n\t"
- "pfadd 8(%4,%0), %%mm1 \n\t"
- "movd %%mm0, (%1) \n\t"
- "movd %%mm1, 16(%1) \n\t"
- "psrlq $32, %%mm0 \n\t"
- "psrlq $32, %%mm1 \n\t"
- "movd %%mm0, 8(%1) \n\t"
- "movd %%mm1, 24(%1) \n\t"
- "sub $32, %1 \n\t"
- "sub $16, %0 \n\t"
- "jge 1b \n\t"
- :"+r"(i), "+r"(dst)
- :"r"(src0), "r"(src1), "r"(src2)
- :"memory"
- );
- }
- else if(step == 1 && src3 == 0){
- __asm__ volatile(
- "1: \n\t"
- "movq (%2,%0), %%mm0 \n\t"
- "movq 8(%2,%0), %%mm1 \n\t"
- "pfmul (%3,%0), %%mm0 \n\t"
- "pfmul 8(%3,%0), %%mm1 \n\t"
- "pfadd (%4,%0), %%mm0 \n\t"
- "pfadd 8(%4,%0), %%mm1 \n\t"
- "movq %%mm0, (%1,%0) \n\t"
- "movq %%mm1, 8(%1,%0) \n\t"
- "sub $16, %0 \n\t"
- "jge 1b \n\t"
- :"+r"(i)
- :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
- :"memory"
- );
- }
- else
- ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
- __asm__ volatile("femms");
-}
-static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
- const float *src2, int src3, int len, int step){
- x86_reg i = (len-8)*4;
- if(step == 2 && src3 == 0){
- dst += (len-8)*2;
- __asm__ volatile(
- "1: \n\t"
- "movaps (%2,%0), %%xmm0 \n\t"
- "movaps 16(%2,%0), %%xmm1 \n\t"
- "mulps (%3,%0), %%xmm0 \n\t"
- "mulps 16(%3,%0), %%xmm1 \n\t"
- "addps (%4,%0), %%xmm0 \n\t"
- "addps 16(%4,%0), %%xmm1 \n\t"
- "movss %%xmm0, (%1) \n\t"
- "movss %%xmm1, 32(%1) \n\t"
- "movhlps %%xmm0, %%xmm2 \n\t"
- "movhlps %%xmm1, %%xmm3 \n\t"
- "movss %%xmm2, 16(%1) \n\t"
- "movss %%xmm3, 48(%1) \n\t"
- "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
- "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
- "movss %%xmm0, 8(%1) \n\t"
- "movss %%xmm1, 40(%1) \n\t"
- "movhlps %%xmm0, %%xmm2 \n\t"
- "movhlps %%xmm1, %%xmm3 \n\t"
- "movss %%xmm2, 24(%1) \n\t"
- "movss %%xmm3, 56(%1) \n\t"
- "sub $64, %1 \n\t"
- "sub $32, %0 \n\t"
- "jge 1b \n\t"
- :"+r"(i), "+r"(dst)
- :"r"(src0), "r"(src1), "r"(src2)
- :"memory"
- );
- }
- else if(step == 1 && src3 == 0){
- __asm__ volatile(
- "1: \n\t"
- "movaps (%2,%0), %%xmm0 \n\t"
- "movaps 16(%2,%0), %%xmm1 \n\t"
- "mulps (%3,%0), %%xmm0 \n\t"
- "mulps 16(%3,%0), %%xmm1 \n\t"
- "addps (%4,%0), %%xmm0 \n\t"
- "addps 16(%4,%0), %%xmm1 \n\t"
- "movaps %%xmm0, (%1,%0) \n\t"
- "movaps %%xmm1, 16(%1,%0) \n\t"
- "sub $32, %0 \n\t"
- "jge 1b \n\t"
- :"+r"(i)
- :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
- :"memory"
- );
- }
- else
- ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
-}
-
-static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
- const float *win, float add_bias, int len){
-#ifdef HAVE_6REGS
- if(add_bias == 0){
- x86_reg i = -len*4;
- x86_reg j = len*4-8;
- __asm__ volatile(
- "1: \n"
- "pswapd (%5,%1), %%mm1 \n"
- "movq (%5,%0), %%mm0 \n"
- "pswapd (%4,%1), %%mm5 \n"
- "movq (%3,%0), %%mm4 \n"
- "movq %%mm0, %%mm2 \n"
- "movq %%mm1, %%mm3 \n"
- "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
- "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j]
- "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
- "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i]
- "pfadd %%mm3, %%mm2 \n"
- "pfsub %%mm0, %%mm1 \n"
- "pswapd %%mm2, %%mm2 \n"
- "movq %%mm1, (%2,%0) \n"
- "movq %%mm2, (%2,%1) \n"
- "sub $8, %1 \n"
- "add $8, %0 \n"
- "jl 1b \n"
- "femms \n"
- :"+r"(i), "+r"(j)
- :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
- );
- }else
-#endif
- ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
-}
-
-static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
- const float *win, float add_bias, int len){
-#ifdef HAVE_6REGS
- if(add_bias == 0){
- x86_reg i = -len*4;
- x86_reg j = len*4-16;
- __asm__ volatile(
- "1: \n"
- "movaps (%5,%1), %%xmm1 \n"
- "movaps (%5,%0), %%xmm0 \n"
- "movaps (%4,%1), %%xmm5 \n"
- "movaps (%3,%0), %%xmm4 \n"
- "shufps $0x1b, %%xmm1, %%xmm1 \n"
- "shufps $0x1b, %%xmm5, %%xmm5 \n"
- "movaps %%xmm0, %%xmm2 \n"
- "movaps %%xmm1, %%xmm3 \n"
- "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
- "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]
- "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
- "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]
- "addps %%xmm3, %%xmm2 \n"
- "subps %%xmm0, %%xmm1 \n"
- "shufps $0x1b, %%xmm2, %%xmm2 \n"
- "movaps %%xmm1, (%2,%0) \n"
- "movaps %%xmm2, (%2,%1) \n"
- "sub $16, %1 \n"
- "add $16, %0 \n"
- "jl 1b \n"
- :"+r"(i), "+r"(j)
- :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
- );
- }else
-#endif
- ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
-}
-
-static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
-{
- x86_reg i = -4*len;
- __asm__ volatile(
- "movss %3, %%xmm4 \n"
- "shufps $0, %%xmm4, %%xmm4 \n"
- "1: \n"
- "cvtpi2ps (%2,%0), %%xmm0 \n"
- "cvtpi2ps 8(%2,%0), %%xmm1 \n"
- "cvtpi2ps 16(%2,%0), %%xmm2 \n"
- "cvtpi2ps 24(%2,%0), %%xmm3 \n"
- "movlhps %%xmm1, %%xmm0 \n"
- "movlhps %%xmm3, %%xmm2 \n"
- "mulps %%xmm4, %%xmm0 \n"
- "mulps %%xmm4, %%xmm2 \n"
- "movaps %%xmm0, (%1,%0) \n"
- "movaps %%xmm2, 16(%1,%0) \n"
- "add $32, %0 \n"
- "jl 1b \n"
- :"+r"(i)
- :"r"(dst+len), "r"(src+len), "m"(mul)
- );
-}
-
-static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
-{
- x86_reg i = -4*len;
- __asm__ volatile(
- "movss %3, %%xmm4 \n"
- "shufps $0, %%xmm4, %%xmm4 \n"
- "1: \n"
- "cvtdq2ps (%2,%0), %%xmm0 \n"
- "cvtdq2ps 16(%2,%0), %%xmm1 \n"
- "mulps %%xmm4, %%xmm0 \n"
- "mulps %%xmm4, %%xmm1 \n"
- "movaps %%xmm0, (%1,%0) \n"
- "movaps %%xmm1, 16(%1,%0) \n"
- "add $32, %0 \n"
- "jl 1b \n"
- :"+r"(i)
- :"r"(dst+len), "r"(src+len), "m"(mul)
- );
-}
-
-static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
- x86_reg reglen = len;
- // not bit-exact: pf2id uses different rounding than C and SSE
- __asm__ volatile(
- "add %0 , %0 \n\t"
- "lea (%2,%0,2) , %2 \n\t"
- "add %0 , %1 \n\t"
- "neg %0 \n\t"
- "1: \n\t"
- "pf2id (%2,%0,2) , %%mm0 \n\t"
- "pf2id 8(%2,%0,2) , %%mm1 \n\t"
- "pf2id 16(%2,%0,2) , %%mm2 \n\t"
- "pf2id 24(%2,%0,2) , %%mm3 \n\t"
- "packssdw %%mm1 , %%mm0 \n\t"
- "packssdw %%mm3 , %%mm2 \n\t"
- "movq %%mm0 , (%1,%0) \n\t"
- "movq %%mm2 , 8(%1,%0) \n\t"
- "add $16 , %0 \n\t"
- " js 1b \n\t"
- "femms \n\t"
- :"+r"(reglen), "+r"(dst), "+r"(src)
- );
-}
-static void float_to_int16_sse(int16_t *dst, const float *src, long len){
- x86_reg reglen = len;
- __asm__ volatile(
- "add %0 , %0 \n\t"
- "lea (%2,%0,2) , %2 \n\t"
- "add %0 , %1 \n\t"
- "neg %0 \n\t"
- "1: \n\t"
- "cvtps2pi (%2,%0,2) , %%mm0 \n\t"
- "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
- "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
- "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
- "packssdw %%mm1 , %%mm0 \n\t"
- "packssdw %%mm3 , %%mm2 \n\t"
- "movq %%mm0 , (%1,%0) \n\t"
- "movq %%mm2 , 8(%1,%0) \n\t"
- "add $16 , %0 \n\t"
- " js 1b \n\t"
- "emms \n\t"
- :"+r"(reglen), "+r"(dst), "+r"(src)
- );
-}
-
-static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
- x86_reg reglen = len;
- __asm__ volatile(
- "add %0 , %0 \n\t"
- "lea (%2,%0,2) , %2 \n\t"
- "add %0 , %1 \n\t"
- "neg %0 \n\t"
- "1: \n\t"
- "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
- "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
- "packssdw %%xmm1 , %%xmm0 \n\t"
- "movdqa %%xmm0 , (%1,%0) \n\t"
- "add $16 , %0 \n\t"
- " js 1b \n\t"
- :"+r"(reglen), "+r"(dst), "+r"(src)
- );
-}
-
-#ifdef HAVE_YASM
-void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
-void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
-void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
-#else
-#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#endif
-#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
-
-#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
-/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
-static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
- DECLARE_ALIGNED_16(int16_t, tmp[len]);\
- int i,j,c;\
- for(c=0; c<channels; c++){\
- float_to_int16_##cpu(tmp, src[c], len);\
- for(i=0, j=c; i<len; i++, j+=channels)\
- dst[j] = tmp[i];\
- }\
-}\
-\
-static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
- if(channels==1)\
- float_to_int16_##cpu(dst, src[0], len);\
- else if(channels==2){\
- x86_reg reglen = len; \
- const float *src0 = src[0];\
- const float *src1 = src[1];\
- __asm__ volatile(\
- "shl $2, %0 \n"\
- "add %0, %1 \n"\
- "add %0, %2 \n"\
- "add %0, %3 \n"\
- "neg %0 \n"\
- body\
- :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
- );\
- }else if(channels==6){\
- ff_float_to_int16_interleave6_##cpu(dst, src, len);\
- }else\
- float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
-}
-
-FLOAT_TO_INT16_INTERLEAVE(3dnow,
- "1: \n"
- "pf2id (%2,%0), %%mm0 \n"
- "pf2id 8(%2,%0), %%mm1 \n"
- "pf2id (%3,%0), %%mm2 \n"
- "pf2id 8(%3,%0), %%mm3 \n"
- "packssdw %%mm1, %%mm0 \n"
- "packssdw %%mm3, %%mm2 \n"
- "movq %%mm0, %%mm1 \n"
- "punpcklwd %%mm2, %%mm0 \n"
- "punpckhwd %%mm2, %%mm1 \n"
- "movq %%mm0, (%1,%0)\n"
- "movq %%mm1, 8(%1,%0)\n"
- "add $16, %0 \n"
- "js 1b \n"
- "femms \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse,
- "1: \n"
- "cvtps2pi (%2,%0), %%mm0 \n"
- "cvtps2pi 8(%2,%0), %%mm1 \n"
- "cvtps2pi (%3,%0), %%mm2 \n"
- "cvtps2pi 8(%3,%0), %%mm3 \n"
- "packssdw %%mm1, %%mm0 \n"
- "packssdw %%mm3, %%mm2 \n"
- "movq %%mm0, %%mm1 \n"
- "punpcklwd %%mm2, %%mm0 \n"
- "punpckhwd %%mm2, %%mm1 \n"
- "movq %%mm0, (%1,%0)\n"
- "movq %%mm1, 8(%1,%0)\n"
- "add $16, %0 \n"
- "js 1b \n"
- "emms \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse2,
- "1: \n"
- "cvtps2dq (%2,%0), %%xmm0 \n"
- "cvtps2dq (%3,%0), %%xmm1 \n"
- "packssdw %%xmm1, %%xmm0 \n"
- "movhlps %%xmm0, %%xmm1 \n"
- "punpcklwd %%xmm1, %%xmm0 \n"
- "movdqa %%xmm0, (%1,%0) \n"
- "add $16, %0 \n"
- "js 1b \n"
-)
-
-static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
- if(channels==6)
- ff_float_to_int16_interleave6_3dn2(dst, src, len);
- else
- float_to_int16_interleave_3dnow(dst, src, len, channels);
-}
-
-
-extern void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
-extern void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
-extern void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
-extern void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
-extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
- int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
-extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
- int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
-
-
-static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
- x86_reg o = -(order << 1);
- v1 += order;
- v2 += order;
- __asm__ volatile(
- "1: \n\t"
- "movdqu (%1,%2), %%xmm0 \n\t"
- "movdqu 16(%1,%2), %%xmm1 \n\t"
- "paddw (%0,%2), %%xmm0 \n\t"
- "paddw 16(%0,%2), %%xmm1 \n\t"
- "movdqa %%xmm0, (%0,%2) \n\t"
- "movdqa %%xmm1, 16(%0,%2) \n\t"
- "add $32, %2 \n\t"
- "js 1b \n\t"
- : "+r"(v1), "+r"(v2), "+r"(o)
- );
-}
-
-static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
- x86_reg o = -(order << 1);
- v1 += order;
- v2 += order;
- __asm__ volatile(
- "1: \n\t"
- "movdqa (%0,%2), %%xmm0 \n\t"
- "movdqa 16(%0,%2), %%xmm2 \n\t"
- "movdqu (%1,%2), %%xmm1 \n\t"
- "movdqu 16(%1,%2), %%xmm3 \n\t"
- "psubw %%xmm1, %%xmm0 \n\t"
- "psubw %%xmm3, %%xmm2 \n\t"
- "movdqa %%xmm0, (%0,%2) \n\t"
- "movdqa %%xmm2, 16(%0,%2) \n\t"
- "add $32, %2 \n\t"
- "js 1b \n\t"
- : "+r"(v1), "+r"(v2), "+r"(o)
- );
-}
-
-static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
-{
- int res = 0;
- DECLARE_ALIGNED_16(int64_t, sh);
- x86_reg o = -(order << 1);
-
- v1 += order;
- v2 += order;
- sh = shift;
- __asm__ volatile(
- "pxor %%xmm7, %%xmm7 \n\t"
- "1: \n\t"
- "movdqu (%0,%3), %%xmm0 \n\t"
- "movdqu 16(%0,%3), %%xmm1 \n\t"
- "pmaddwd (%1,%3), %%xmm0 \n\t"
- "pmaddwd 16(%1,%3), %%xmm1 \n\t"
- "paddd %%xmm0, %%xmm7 \n\t"
- "paddd %%xmm1, %%xmm7 \n\t"
- "add $32, %3 \n\t"
- "js 1b \n\t"
- "movhlps %%xmm7, %%xmm2 \n\t"
- "paddd %%xmm2, %%xmm7 \n\t"
- "psrad %4, %%xmm7 \n\t"
- "pshuflw $0x4E, %%xmm7,%%xmm2 \n\t"
- "paddd %%xmm2, %%xmm7 \n\t"
- "movd %%xmm7, %2 \n\t"
- : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
- : "m"(sh)
- );
- return res;
-}
-
-void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
-{
- mm_flags = mm_support();
-
- if (avctx->dsp_mask) {
- if (avctx->dsp_mask & FF_MM_FORCE)
- mm_flags |= (avctx->dsp_mask & 0xffff);
- else
- mm_flags &= ~(avctx->dsp_mask & 0xffff);
- }
-
-#if 0
- av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
- if (mm_flags & FF_MM_MMX)
- av_log(avctx, AV_LOG_INFO, " mmx");
- if (mm_flags & FF_MM_MMXEXT)
- av_log(avctx, AV_LOG_INFO, " mmxext");
- if (mm_flags & FF_MM_3DNOW)
- av_log(avctx, AV_LOG_INFO, " 3dnow");
- if (mm_flags & FF_MM_SSE)
- av_log(avctx, AV_LOG_INFO, " sse");
- if (mm_flags & FF_MM_SSE2)
- av_log(avctx, AV_LOG_INFO, " sse2");
- av_log(avctx, AV_LOG_INFO, "\n");
-#endif
-
- if (mm_flags & FF_MM_MMX) {
- const int idct_algo= avctx->idct_algo;
-
- if(avctx->lowres==0){
- if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
- c->idct_put= ff_simple_idct_put_mmx;
- c->idct_add= ff_simple_idct_add_mmx;
- c->idct = ff_simple_idct_mmx;
- c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
-#ifdef CONFIG_GPL
- }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
- if(mm_flags & FF_MM_MMXEXT){
- c->idct_put= ff_libmpeg2mmx2_idct_put;
- c->idct_add= ff_libmpeg2mmx2_idct_add;
- c->idct = ff_mmxext_idct;
- }else{
- c->idct_put= ff_libmpeg2mmx_idct_put;
- c->idct_add= ff_libmpeg2mmx_idct_add;
- c->idct = ff_mmx_idct;
- }
- c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
-#endif
- }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER) &&
- idct_algo==FF_IDCT_VP3){
- if(mm_flags & FF_MM_SSE2){
- c->idct_put= ff_vp3_idct_put_sse2;
- c->idct_add= ff_vp3_idct_add_sse2;
- c->idct = ff_vp3_idct_sse2;
- c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
- }else{
- c->idct_put= ff_vp3_idct_put_mmx;
- c->idct_add= ff_vp3_idct_add_mmx;
- c->idct = ff_vp3_idct_mmx;
- c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
- }
- }else if(idct_algo==FF_IDCT_CAVS){
- c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
- }else if(idct_algo==FF_IDCT_XVIDMMX){
- if(mm_flags & FF_MM_SSE2){
- c->idct_put= ff_idct_xvid_sse2_put;
- c->idct_add= ff_idct_xvid_sse2_add;
- c->idct = ff_idct_xvid_sse2;
- c->idct_permutation_type= FF_SSE2_IDCT_PERM;
- }else if(mm_flags & FF_MM_MMXEXT){
- c->idct_put= ff_idct_xvid_mmx2_put;
- c->idct_add= ff_idct_xvid_mmx2_add;
- c->idct = ff_idct_xvid_mmx2;
- }else{
- c->idct_put= ff_idct_xvid_mmx_put;
- c->idct_add= ff_idct_xvid_mmx_add;
- c->idct = ff_idct_xvid_mmx;
- }
- }
- }
-
- c->put_pixels_clamped = put_pixels_clamped_mmx;
- c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
- c->add_pixels_clamped = add_pixels_clamped_mmx;
- c->clear_blocks = clear_blocks_mmx;
-
-#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
- c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
- c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
-
- SET_HPEL_FUNCS(put, 0, 16, mmx);
- SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
- SET_HPEL_FUNCS(avg, 0, 16, mmx);
- SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
- SET_HPEL_FUNCS(put, 1, 8, mmx);
- SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
- SET_HPEL_FUNCS(avg, 1, 8, mmx);
- SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
-
- c->gmc= gmc_mmx;
-
- c->add_bytes= add_bytes_mmx;
- c->add_bytes_l2= add_bytes_l2_mmx;
-
- c->draw_edges = draw_edges_mmx;
-
- if (ENABLE_ANY_H263) {
- c->h263_v_loop_filter= h263_v_loop_filter_mmx;
- c->h263_h_loop_filter= h263_h_loop_filter_mmx;
- }
- c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
- c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
- c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
-
- c->h264_idct_dc_add=
- c->h264_idct_add= ff_h264_idct_add_mmx;
- c->h264_idct8_dc_add=
- c->h264_idct8_add= ff_h264_idct8_add_mmx;
- if (mm_flags & FF_MM_SSE2)
- c->h264_idct8_add= ff_h264_idct8_add_sse2;
-
- if (mm_flags & FF_MM_MMXEXT) {
- c->prefetch = prefetch_mmx2;
-
- c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
- c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
-
- c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
- c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
- c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
-
- c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
- c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
-
- c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
- c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
- c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
-
- c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
- c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
-
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
- c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
- c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
- c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
- c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
- c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
-
- if (ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) {
- c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
- c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
- }
- }
-
-#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
- c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
- c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
-
- SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
- SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
- SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
- SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
- SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
- SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
-
- SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
- SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
- SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
- SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
- SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
- SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
-
- SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
- SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
- SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
- SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
-
- c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
- c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
- c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
- c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
- c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
- c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
- c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
- c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
- c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
- c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
- c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
-
- c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
- c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
- c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
- c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
- c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
- c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
- c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
- c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
-
- c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
- c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
- c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
- c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
- c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
- c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
- c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
- c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
-
- if (ENABLE_CAVS_DECODER)
- ff_cavsdsp_init_mmx2(c, avctx);
-
- if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
- ff_vc1dsp_init_mmx(c, avctx);
-
- c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
- } else if (mm_flags & FF_MM_3DNOW) {
- c->prefetch = prefetch_3dnow;
-
- c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
- c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
-
- c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
- c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
- c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
-
- c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
- c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
-
- c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
- c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
- c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
-
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
- c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
- c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
- c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
- c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
- c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
- }
-
- SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
- SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
- SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
- SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
- SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
- SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
-
- SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
- SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
- SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
- SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
- SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
- SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
-
- SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
- SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
- SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
- SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
-
- c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
- c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
-
- if (ENABLE_CAVS_DECODER)
- ff_cavsdsp_init_3dnow(c, avctx);
- }
-
-
-#define H264_QPEL_FUNCS(x, y, CPU)\
- c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
- c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
- c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
- c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
- if((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)){
- // these functions are slower than mmx on AMD, but faster on Intel
-/* FIXME works in most codecs, but crashes svq1 due to unaligned chroma
- c->put_pixels_tab[0][0] = put_pixels16_sse2;
- c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
-*/
- H264_QPEL_FUNCS(0, 0, sse2);
- }
- if(mm_flags & FF_MM_SSE2){
- H264_QPEL_FUNCS(0, 1, sse2);
- H264_QPEL_FUNCS(0, 2, sse2);
- H264_QPEL_FUNCS(0, 3, sse2);
- H264_QPEL_FUNCS(1, 1, sse2);
- H264_QPEL_FUNCS(1, 2, sse2);
- H264_QPEL_FUNCS(1, 3, sse2);
- H264_QPEL_FUNCS(2, 1, sse2);
- H264_QPEL_FUNCS(2, 2, sse2);
- H264_QPEL_FUNCS(2, 3, sse2);
- H264_QPEL_FUNCS(3, 1, sse2);
- H264_QPEL_FUNCS(3, 2, sse2);
- H264_QPEL_FUNCS(3, 3, sse2);
- }
-#ifdef HAVE_SSSE3
- if(mm_flags & FF_MM_SSSE3){
- H264_QPEL_FUNCS(1, 0, ssse3);
- H264_QPEL_FUNCS(1, 1, ssse3);
- H264_QPEL_FUNCS(1, 2, ssse3);
- H264_QPEL_FUNCS(1, 3, ssse3);
- H264_QPEL_FUNCS(2, 0, ssse3);
- H264_QPEL_FUNCS(2, 1, ssse3);
- H264_QPEL_FUNCS(2, 2, ssse3);
- H264_QPEL_FUNCS(2, 3, ssse3);
- H264_QPEL_FUNCS(3, 0, ssse3);
- H264_QPEL_FUNCS(3, 1, ssse3);
- H264_QPEL_FUNCS(3, 2, ssse3);
- H264_QPEL_FUNCS(3, 3, ssse3);
- c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_nornd;
- c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd;
- c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd;
- c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
- c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
- c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
- }
-#endif
-
-#ifdef CONFIG_SNOW_DECODER
- if(mm_flags & FF_MM_SSE2 & 0){
- c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
-#ifdef HAVE_7REGS
- c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
-#endif
- c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
- }
- else{
- if(mm_flags & FF_MM_MMXEXT){
- c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
-#ifdef HAVE_7REGS
- c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
-#endif
- }
- c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
- }
-#endif
-
- if(mm_flags & FF_MM_3DNOW){
- c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
- c->vector_fmul = vector_fmul_3dnow;
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->float_to_int16 = float_to_int16_3dnow;
- c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
- }
- }
- if(mm_flags & FF_MM_3DNOWEXT){
- c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
- c->vector_fmul_window = vector_fmul_window_3dnow2;
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
- }
- }
- if(mm_flags & FF_MM_SSE){
- c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
- c->ac3_downmix = ac3_downmix_sse;
- c->vector_fmul = vector_fmul_sse;
- c->vector_fmul_reverse = vector_fmul_reverse_sse;
- c->vector_fmul_add_add = vector_fmul_add_add_sse;
- c->vector_fmul_window = vector_fmul_window_sse;
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
- c->float_to_int16 = float_to_int16_sse;
- c->float_to_int16_interleave = float_to_int16_interleave_sse;
- }
- if(mm_flags & FF_MM_3DNOW)
- c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
- if(mm_flags & FF_MM_SSE2){
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
- c->float_to_int16 = float_to_int16_sse2;
- c->float_to_int16_interleave = float_to_int16_interleave_sse2;
- c->add_int16 = add_int16_sse2;
- c->sub_int16 = sub_int16_sse2;
- c->scalarproduct_int16 = scalarproduct_int16_sse2;
- }
- }
-
- if (ENABLE_ENCODERS)
- dsputilenc_init_mmx(c, avctx);
-
-#if 0
- // for speed testing
- get_pixels = just_return;
- put_pixels_clamped = just_return;
- add_pixels_clamped = just_return;
-
- pix_abs16x16 = just_return;
- pix_abs16x16_x2 = just_return;
- pix_abs16x16_y2 = just_return;
- pix_abs16x16_xy2 = just_return;
-
- put_pixels_tab[0] = just_return;
- put_pixels_tab[1] = just_return;
- put_pixels_tab[2] = just_return;
- put_pixels_tab[3] = just_return;
-
- put_no_rnd_pixels_tab[0] = just_return;
- put_no_rnd_pixels_tab[1] = just_return;
- put_no_rnd_pixels_tab[2] = just_return;
- put_no_rnd_pixels_tab[3] = just_return;
-
- avg_pixels_tab[0] = just_return;
- avg_pixels_tab[1] = just_return;
- avg_pixels_tab[2] = just_return;
- avg_pixels_tab[3] = just_return;
-
- avg_no_rnd_pixels_tab[0] = just_return;
- avg_no_rnd_pixels_tab[1] = just_return;
- avg_no_rnd_pixels_tab[2] = just_return;
- avg_no_rnd_pixels_tab[3] = just_return;
-
- //av_fdct = just_return;
- //ff_idct = just_return;
-#endif
-}
diff --git a/libavcodec/i386/dsputil_mmx.h b/libavcodec/i386/dsputil_mmx.h
deleted file mode 100644
index 6c056f7..0000000
--- a/libavcodec/i386/dsputil_mmx.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * MMX optimized DSP utils
- * Copyright (c) 2007 Aurelien Jacobs <aurel at gnuage.org>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_I386_DSPUTIL_MMX_H
-#define AVCODEC_I386_DSPUTIL_MMX_H
-
-#include <stdint.h>
-#include "libavcodec/dsputil.h"
-
-typedef struct { uint64_t a, b; } xmm_t;
-
-extern const uint64_t ff_bone;
-extern const uint64_t ff_wtwo;
-
-extern const uint64_t ff_pdw_80000000[2];
-
-extern const uint64_t ff_pw_3;
-extern const uint64_t ff_pw_4;
-extern const xmm_t ff_pw_5;
-extern const xmm_t ff_pw_8;
-extern const uint64_t ff_pw_15;
-extern const xmm_t ff_pw_16;
-extern const uint64_t ff_pw_20;
-extern const xmm_t ff_pw_28;
-extern const xmm_t ff_pw_32;
-extern const uint64_t ff_pw_42;
-extern const uint64_t ff_pw_64;
-extern const uint64_t ff_pw_96;
-extern const uint64_t ff_pw_128;
-extern const uint64_t ff_pw_255;
-
-extern const uint64_t ff_pb_1;
-extern const uint64_t ff_pb_3;
-extern const uint64_t ff_pb_7;
-extern const uint64_t ff_pb_1F;
-extern const uint64_t ff_pb_3F;
-extern const uint64_t ff_pb_81;
-extern const uint64_t ff_pb_A1;
-extern const uint64_t ff_pb_FC;
-
-extern const double ff_pd_1[2];
-extern const double ff_pd_2[2];
-
-#define LOAD4(stride,in,a,b,c,d)\
- "movq 0*"#stride"+"#in", "#a"\n\t"\
- "movq 1*"#stride"+"#in", "#b"\n\t"\
- "movq 2*"#stride"+"#in", "#c"\n\t"\
- "movq 3*"#stride"+"#in", "#d"\n\t"
-
-#define STORE4(stride,out,a,b,c,d)\
- "movq "#a", 0*"#stride"+"#out"\n\t"\
- "movq "#b", 1*"#stride"+"#out"\n\t"\
- "movq "#c", 2*"#stride"+"#out"\n\t"\
- "movq "#d", 3*"#stride"+"#out"\n\t"
-
-/* in/out: mma=mma+mmb, mmb=mmb-mma */
-#define SUMSUB_BA( a, b ) \
- "paddw "#b", "#a" \n\t"\
- "paddw "#b", "#b" \n\t"\
- "psubw "#a", "#b" \n\t"
-
-#define SBUTTERFLY(a,b,t,n,m)\
- "mov" #m " " #a ", " #t " \n\t" /* abcd */\
- "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
- "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
-
-#define TRANSPOSE4(a,b,c,d,t)\
- SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
- SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
- SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
- SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
-
-// e,f,g,h can be memory
-// out: a,d,t,c
-#define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\
- "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\
- "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\
- "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 d3 g3 */\
- "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\
- SBUTTERFLY(a, b, t, bw, q) /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\
- /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\
- SBUTTERFLY(c, d, b, bw, q) /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\
- /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\
- SBUTTERFLY(a, c, d, wd, q) /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\
- /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\
- SBUTTERFLY(t, b, c, wd, q) /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\
- /* c= a3 b3 c3 d3 e3 f3 g3 h3 */
-
-#ifdef ARCH_X86_64
-// permutes 01234567 -> 05736421
-#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
- SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
- SBUTTERFLY(c,d,b,wd,dqa)\
- SBUTTERFLY(e,f,d,wd,dqa)\
- SBUTTERFLY(g,h,f,wd,dqa)\
- SBUTTERFLY(a,c,h,dq,dqa)\
- SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
- SBUTTERFLY(e,g,b,dq,dqa)\
- SBUTTERFLY(d,f,g,dq,dqa)\
- SBUTTERFLY(a,e,f,qdq,dqa)\
- SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
- SBUTTERFLY(h,b,d,qdq,dqa)\
- SBUTTERFLY(c,g,b,qdq,dqa)\
- "movdqa %%xmm8, "#g" \n\t"
-#else
-#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
- "movdqa "#h", "#t" \n\t"\
- SBUTTERFLY(a,b,h,wd,dqa)\
- "movdqa "#h", 16"#t" \n\t"\
- "movdqa "#t", "#h" \n\t"\
- SBUTTERFLY(c,d,b,wd,dqa)\
- SBUTTERFLY(e,f,d,wd,dqa)\
- SBUTTERFLY(g,h,f,wd,dqa)\
- SBUTTERFLY(a,c,h,dq,dqa)\
- "movdqa "#h", "#t" \n\t"\
- "movdqa 16"#t", "#h" \n\t"\
- SBUTTERFLY(h,b,c,dq,dqa)\
- SBUTTERFLY(e,g,b,dq,dqa)\
- SBUTTERFLY(d,f,g,dq,dqa)\
- SBUTTERFLY(a,e,f,qdq,dqa)\
- SBUTTERFLY(h,d,e,qdq,dqa)\
- "movdqa "#h", 16"#t" \n\t"\
- "movdqa "#t", "#h" \n\t"\
- SBUTTERFLY(h,b,d,qdq,dqa)\
- SBUTTERFLY(c,g,b,qdq,dqa)\
- "movdqa 16"#t", "#g" \n\t"
-#endif
-
-#define MOVQ_WONE(regd) \
- __asm__ volatile ( \
- "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
- "psrlw $15, %%" #regd ::)
-
-void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx);
-
-#endif /* AVCODEC_I386_DSPUTIL_MMX_H */
diff --git a/libavcodec/i386/dsputil_mmx_avg_template.c b/libavcodec/i386/dsputil_mmx_avg_template.c
deleted file mode 100644
index a3f2068..0000000
--- a/libavcodec/i386/dsputil_mmx_avg_template.c
+++ /dev/null
@@ -1,900 +0,0 @@
-/*
- * DSP utils : average functions are compiled twice for 3dnow/mmx2
- * Copyright (c) 2000, 2001 Fabrice Bellard.
- * Copyright (c) 2002-2004 Michael Niedermayer
- *
- * MMX optimization by Nick Kurshev <nickols_k at mail.ru>
- * mostly rewritten by Michael Niedermayer <michaelni at gmx.at>
- * and improved by Zdenek Kabelac <kabi at users.sf.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/* This header intentionally has no multiple inclusion guards. It is meant to
- * be included multiple times and generates different code depending on the
- * value of certain #defines. */
-
-/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
- clobber bug - now it will work with 2.95.2 and also with -fPIC
- */
-static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- PAVGB" 1(%1), %%mm0 \n\t"
- PAVGB" 1(%1, %3), %%mm1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- PAVGB" 1(%1), %%mm0 \n\t"
- PAVGB" 1(%1, %3), %%mm1 \n\t"
- "add %%"REG_a", %1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
-}
-
-static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- __asm__ volatile(
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movd (%1), %%mm0 \n\t"
- "movd (%2), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "add $4, %2 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- "movd %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- "1: \n\t"
- "movd (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movd (%1), %%mm1 \n\t"
- "movd (%2), %%mm2 \n\t"
- "movd 4(%2), %%mm3 \n\t"
- "add %4, %1 \n\t"
- PAVGB" %%mm2, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm1 \n\t"
- "movd %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "movd %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "movd (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movd (%1), %%mm1 \n\t"
- "movd 8(%2), %%mm2 \n\t"
- "movd 12(%2), %%mm3 \n\t"
- "add %4, %1 \n\t"
- PAVGB" %%mm2, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm1 \n\t"
- "movd %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "movd %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "add $16, %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
-#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-}
-
-
-static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- __asm__ volatile(
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%2), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "add $8, %2 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movq (%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" 8(%2), %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movq (%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" 16(%2), %%mm0 \n\t"
- PAVGB" 24(%2), %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "add $32, %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
-#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-//the following should be used, though better not with gcc ...
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
- :"r"(src1Stride), "r"(dstStride)
- :"memory");*/
-}
-
-static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- __asm__ volatile(
- "pcmpeqb %%mm6, %%mm6 \n\t"
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%2), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "add $8, %2 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movq (%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "movq (%2), %%mm2 \n\t"
- "movq 8(%2), %%mm3 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "pxor %%mm6, %%mm3 \n\t"
- PAVGB" %%mm2, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm1 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movq (%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "movq 16(%2), %%mm2 \n\t"
- "movq 24(%2), %%mm3 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "pxor %%mm6, %%mm3 \n\t"
- PAVGB" %%mm2, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm1 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "add $32, %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
-#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-//the following should be used, though better not with gcc ...
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
- :"r"(src1Stride), "r"(dstStride)
- :"memory");*/
-}
-
-static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- __asm__ volatile(
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movd (%1), %%mm0 \n\t"
- "movd (%2), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "add $4, %2 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" (%3), %%mm0 \n\t"
- "movd %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- "1: \n\t"
- "movd (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movd (%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" 4(%2), %%mm1 \n\t"
- PAVGB" (%3), %%mm0 \n\t"
- "movd %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- PAVGB" (%3), %%mm1 \n\t"
- "movd %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "movd (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movd (%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" 8(%2), %%mm0 \n\t"
- PAVGB" 12(%2), %%mm1 \n\t"
- PAVGB" (%3), %%mm0 \n\t"
- "movd %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- PAVGB" (%3), %%mm1 \n\t"
- "movd %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "add $16, %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
-#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-}
-
-
-static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- __asm__ volatile(
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%2), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "add $8, %2 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" (%3), %%mm0 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movq (%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" 8(%2), %%mm1 \n\t"
- PAVGB" (%3), %%mm0 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- PAVGB" (%3), %%mm1 \n\t"
- "movq %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq (%1), %%mm0 \n\t"
- "add %4, %1 \n\t"
- "movq (%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" 16(%2), %%mm0 \n\t"
- PAVGB" 24(%2), %%mm1 \n\t"
- PAVGB" (%3), %%mm0 \n\t"
- "movq %%mm0, (%3) \n\t"
- "add %5, %3 \n\t"
- PAVGB" (%3), %%mm1 \n\t"
- "movq %%mm1, (%3) \n\t"
- "add %5, %3 \n\t"
- "add $32, %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
-#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-//the following should be used, though better not with gcc ...
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
- :"r"(src1Stride), "r"(dstStride)
- :"memory");*/
-}
-
-static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq 8(%1), %%mm2 \n\t"
- "movq 8(%1, %3), %%mm3 \n\t"
- PAVGB" 1(%1), %%mm0 \n\t"
- PAVGB" 1(%1, %3), %%mm1 \n\t"
- PAVGB" 9(%1), %%mm2 \n\t"
- PAVGB" 9(%1, %3), %%mm3 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "movq %%mm2, 8(%2) \n\t"
- "movq %%mm3, 8(%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq 8(%1), %%mm2 \n\t"
- "movq 8(%1, %3), %%mm3 \n\t"
- PAVGB" 1(%1), %%mm0 \n\t"
- PAVGB" 1(%1, %3), %%mm1 \n\t"
- PAVGB" 9(%1), %%mm2 \n\t"
- PAVGB" 9(%1, %3), %%mm3 \n\t"
- "add %%"REG_a", %1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "movq %%mm2, 8(%2) \n\t"
- "movq %%mm3, 8(%2, %3) \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
-}
-
-static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- __asm__ volatile(
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" 8(%2), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "add $16, %2 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" 8(%2), %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" 16(%2), %%mm0 \n\t"
- PAVGB" 24(%2), %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "add $32, %2 \n\t"
- "subl $2, %0 \n\t"
- "jnz 1b \n\t"
-#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-//the following should be used, though better not with gcc ...
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
- :"r"(src1Stride), "r"(dstStride)
- :"memory");*/
-}
-
-static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- __asm__ volatile(
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" 8(%2), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "add $16, %2 \n\t"
- PAVGB" (%3), %%mm0 \n\t"
- PAVGB" 8(%3), %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" 8(%2), %%mm1 \n\t"
- PAVGB" (%3), %%mm0 \n\t"
- PAVGB" 8(%3), %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- PAVGB" 16(%2), %%mm0 \n\t"
- PAVGB" 24(%2), %%mm1 \n\t"
- PAVGB" (%3), %%mm0 \n\t"
- PAVGB" 8(%3), %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "add $32, %2 \n\t"
- "subl $2, %0 \n\t"
- "jnz 1b \n\t"
-#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-//the following should be used, though better not with gcc ...
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
- :"r"(src1Stride), "r"(dstStride)
- :"memory");*/
-}
-
-static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- __asm__ volatile(
- "pcmpeqb %%mm6, %%mm6 \n\t"
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "movq (%2), %%mm2 \n\t"
- "movq 8(%2), %%mm3 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "pxor %%mm6, %%mm3 \n\t"
- PAVGB" %%mm2, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm1 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "add %4, %1 \n\t"
- "add $16, %2 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "movq (%2), %%mm2 \n\t"
- "movq 8(%2), %%mm3 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "pxor %%mm6, %%mm3 \n\t"
- PAVGB" %%mm2, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm1 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 8(%1), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "movq 16(%2), %%mm2 \n\t"
- "movq 24(%2), %%mm3 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "pxor %%mm6, %%mm2 \n\t"
- "pxor %%mm6, %%mm3 \n\t"
- PAVGB" %%mm2, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm1 \n\t"
- "pxor %%mm6, %%mm0 \n\t"
- "pxor %%mm6, %%mm1 \n\t"
- "movq %%mm0, (%3) \n\t"
- "movq %%mm1, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "add $32, %2 \n\t"
- "subl $2, %0 \n\t"
- "jnz 1b \n\t"
-#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-//the following should be used, though better not with gcc ...
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
- :"r"(src1Stride), "r"(dstStride)
- :"memory");*/
-}
-
-/* GL: this function does incorrect rounding if overflow */
-static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- MOVQ_BONE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- "add %%"REG_a", %1 \n\t"
- "psubusb %%mm6, %%mm0 \n\t"
- "psubusb %%mm6, %%mm2 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm2, (%2, %3) \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- "add %%"REG_a", %2 \n\t"
- "add %%"REG_a", %1 \n\t"
- "psubusb %%mm6, %%mm0 \n\t"
- "psubusb %%mm6, %%mm2 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm2, (%2, %3) \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
-}
-
-static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- "sub %3, %2 \n\t"
- "1: \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm2 \n\t"
- "add %%"REG_a", %1 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm2, %%mm1 \n\t"
- "movq %%mm0, (%2, %3) \n\t"
- "movq %%mm1, (%2, %%"REG_a") \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- "add %%"REG_a", %2 \n\t"
- "add %%"REG_a", %1 \n\t"
- PAVGB" %%mm1, %%mm2 \n\t"
- PAVGB" %%mm0, %%mm1 \n\t"
- "movq %%mm2, (%2, %3) \n\t"
- "movq %%mm1, (%2, %%"REG_a") \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D" (block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
-}
-
-/* GL: this function does incorrect rounding if overflow */
-static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- MOVQ_BONE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- "sub %3, %2 \n\t"
- "1: \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm2 \n\t"
- "add %%"REG_a", %1 \n\t"
- "psubusb %%mm6, %%mm1 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm2, %%mm1 \n\t"
- "movq %%mm0, (%2, %3) \n\t"
- "movq %%mm1, (%2, %%"REG_a") \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- "add %%"REG_a", %2 \n\t"
- "add %%"REG_a", %1 \n\t"
- "psubusb %%mm6, %%mm1 \n\t"
- PAVGB" %%mm1, %%mm2 \n\t"
- PAVGB" %%mm0, %%mm1 \n\t"
- "movq %%mm2, (%2, %3) \n\t"
- "movq %%mm1, (%2, %%"REG_a") \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D" (block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
-}
-
-static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%2), %%mm0 \n\t"
- "movq (%2, %3), %%mm1 \n\t"
- PAVGB" (%1), %%mm0 \n\t"
- PAVGB" (%1, %3), %%mm1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%2), %%mm0 \n\t"
- "movq (%2, %3), %%mm1 \n\t"
- PAVGB" (%1), %%mm0 \n\t"
- PAVGB" (%1, %3), %%mm1 \n\t"
- "add %%"REG_a", %1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
-}
-
-static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- PAVGB" 1(%1), %%mm0 \n\t"
- PAVGB" 1(%1, %3), %%mm2 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" (%2, %3), %%mm2 \n\t"
- "add %%"REG_a", %1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm2, (%2, %3) \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- PAVGB" 1(%1), %%mm0 \n\t"
- PAVGB" 1(%1, %3), %%mm2 \n\t"
- "add %%"REG_a", %2 \n\t"
- "add %%"REG_a", %1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" (%2, %3), %%mm2 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm2, (%2, %3) \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
-}
-
-static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- "sub %3, %2 \n\t"
- "1: \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm2 \n\t"
- "add %%"REG_a", %1 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm2, %%mm1 \n\t"
- "movq (%2, %3), %%mm3 \n\t"
- "movq (%2, %%"REG_a"), %%mm4 \n\t"
- PAVGB" %%mm3, %%mm0 \n\t"
- PAVGB" %%mm4, %%mm1 \n\t"
- "movq %%mm0, (%2, %3) \n\t"
- "movq %%mm1, (%2, %%"REG_a") \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- PAVGB" %%mm1, %%mm2 \n\t"
- PAVGB" %%mm0, %%mm1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "add %%"REG_a", %1 \n\t"
- "movq (%2, %3), %%mm3 \n\t"
- "movq (%2, %%"REG_a"), %%mm4 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- PAVGB" %%mm4, %%mm1 \n\t"
- "movq %%mm2, (%2, %3) \n\t"
- "movq %%mm1, (%2, %%"REG_a") \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
-}
-
-/* Note this is not correctly rounded, but this function is only
- * used for B-frames so it does not matter. */
-static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- MOVQ_BONE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- PAVGB" 1(%1), %%mm0 \n\t"
- ASMALIGN(3)
- "1: \n\t"
- "movq (%1, %%"REG_a"), %%mm2 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "psubusb %%mm6, %%mm2 \n\t"
- PAVGB" 1(%1, %3), %%mm1 \n\t"
- PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
- "add %%"REG_a", %1 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm2, %%mm1 \n\t"
- PAVGB" (%2), %%mm0 \n\t"
- PAVGB" (%2, %3), %%mm1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- PAVGB" 1(%1, %3), %%mm1 \n\t"
- PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
- "add %%"REG_a", %2 \n\t"
- "add %%"REG_a", %1 \n\t"
- PAVGB" %%mm1, %%mm2 \n\t"
- PAVGB" %%mm0, %%mm1 \n\t"
- PAVGB" (%2), %%mm2 \n\t"
- PAVGB" (%2, %3), %%mm1 \n\t"
- "movq %%mm2, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r" ((x86_reg)line_size)
- :"%"REG_a, "memory");
-}
-
-static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- do {
- __asm__ volatile(
- "movd (%1), %%mm0 \n\t"
- "movd (%1, %2), %%mm1 \n\t"
- "movd (%1, %2, 2), %%mm2 \n\t"
- "movd (%1, %3), %%mm3 \n\t"
- PAVGB" (%0), %%mm0 \n\t"
- PAVGB" (%0, %2), %%mm1 \n\t"
- PAVGB" (%0, %2, 2), %%mm2 \n\t"
- PAVGB" (%0, %3), %%mm3 \n\t"
- "movd %%mm0, (%1) \n\t"
- "movd %%mm1, (%1, %2) \n\t"
- "movd %%mm2, (%1, %2, 2) \n\t"
- "movd %%mm3, (%1, %3) \n\t"
- ::"S"(pixels), "D"(block),
- "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size)
- :"memory");
- block += 4*line_size;
- pixels += 4*line_size;
- h -= 4;
- } while(h > 0);
-}
-
-//FIXME the following could be optimized too ...
-static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
- DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
-}
-static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(put_pixels8_y2)(block , pixels , line_size, h);
- DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
-}
-static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
- DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
-}
-static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(avg_pixels8)(block , pixels , line_size, h);
- DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
-}
-static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(avg_pixels8_x2)(block , pixels , line_size, h);
- DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
-}
-static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(avg_pixels8_y2)(block , pixels , line_size, h);
- DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
-}
-static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
- DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
-}
-
-#define QPEL_2TAP_L3(OPNAME) \
-static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
- __asm__ volatile(\
- "1: \n\t"\
- "movq (%1,%2), %%mm0 \n\t"\
- "movq 8(%1,%2), %%mm1 \n\t"\
- PAVGB" (%1,%3), %%mm0 \n\t"\
- PAVGB" 8(%1,%3), %%mm1 \n\t"\
- PAVGB" (%1), %%mm0 \n\t"\
- PAVGB" 8(%1), %%mm1 \n\t"\
- STORE_OP( (%1,%4),%%mm0)\
- STORE_OP(8(%1,%4),%%mm1)\
- "movq %%mm0, (%1,%4) \n\t"\
- "movq %%mm1, 8(%1,%4) \n\t"\
- "add %5, %1 \n\t"\
- "decl %0 \n\t"\
- "jnz 1b \n\t"\
- :"+g"(h), "+r"(src)\
- :"r"((x86_reg)off1), "r"((x86_reg)off2),\
- "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
- :"memory"\
- );\
-}\
-static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
- __asm__ volatile(\
- "1: \n\t"\
- "movq (%1,%2), %%mm0 \n\t"\
- PAVGB" (%1,%3), %%mm0 \n\t"\
- PAVGB" (%1), %%mm0 \n\t"\
- STORE_OP((%1,%4),%%mm0)\
- "movq %%mm0, (%1,%4) \n\t"\
- "add %5, %1 \n\t"\
- "decl %0 \n\t"\
- "jnz 1b \n\t"\
- :"+g"(h), "+r"(src)\
- :"r"((x86_reg)off1), "r"((x86_reg)off2),\
- "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
- :"memory"\
- );\
-}
-
-#define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
-QPEL_2TAP_L3(avg_)
-#undef STORE_OP
-#define STORE_OP(a,b)
-QPEL_2TAP_L3(put_)
-#undef STORE_OP
-#undef QPEL_2TAP_L3
diff --git a/libavcodec/i386/dsputil_mmx_qns_template.c b/libavcodec/i386/dsputil_mmx_qns_template.c
deleted file mode 100644
index 1f484e7..0000000
--- a/libavcodec/i386/dsputil_mmx_qns_template.c
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3
- * Copyright (c) 2004 Michael Niedermayer
- *
- * MMX optimization by Michael Niedermayer <michaelni at gmx.at>
- * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng at gmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/* This header intentionally has no multiple inclusion guards. It is meant to
- * be included multiple times and generates different code depending on the
- * value of certain #defines. */
-
-#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
-
-static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
-{
- x86_reg i=0;
-
- assert(FFABS(scale) < MAX_ABS);
- scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
-
- SET_RND(mm6);
- __asm__ volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "movd %4, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- ASMALIGN(4)
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
- "paddw (%2, %0), %%mm0 \n\t"
- "paddw 8(%2, %0), %%mm1 \n\t"
- "psraw $6, %%mm0 \n\t"
- "psraw $6, %%mm1 \n\t"
- "pmullw (%3, %0), %%mm0 \n\t"
- "pmullw 8(%3, %0), %%mm1 \n\t"
- "pmaddwd %%mm0, %%mm0 \n\t"
- "pmaddwd %%mm1, %%mm1 \n\t"
- "paddd %%mm1, %%mm0 \n\t"
- "psrld $4, %%mm0 \n\t"
- "paddd %%mm0, %%mm7 \n\t"
- "add $16, %0 \n\t"
- "cmp $128, %0 \n\t" //FIXME optimize & bench
- " jb 1b \n\t"
- PHADDD(%%mm7, %%mm6)
- "psrld $2, %%mm7 \n\t"
- "movd %%mm7, %0 \n\t"
-
- : "+r" (i)
- : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
- );
- return i;
-}
-
-static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
-{
- x86_reg i=0;
-
- if(FFABS(scale) < MAX_ABS){
- scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
- SET_RND(mm6);
- __asm__ volatile(
- "movd %3, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- ASMALIGN(4)
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
- "paddw (%2, %0), %%mm0 \n\t"
- "paddw 8(%2, %0), %%mm1 \n\t"
- "movq %%mm0, (%2, %0) \n\t"
- "movq %%mm1, 8(%2, %0) \n\t"
- "add $16, %0 \n\t"
- "cmp $128, %0 \n\t" // FIXME optimize & bench
- " jb 1b \n\t"
-
- : "+r" (i)
- : "r"(basis), "r"(rem), "g"(scale)
- );
- }else{
- for(i=0; i<8*8; i++){
- rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
- }
- }
-}
diff --git a/libavcodec/i386/dsputil_mmx_rnd_template.c b/libavcodec/i386/dsputil_mmx_rnd_template.c
deleted file mode 100644
index 5ef06da..0000000
--- a/libavcodec/i386/dsputil_mmx_rnd_template.c
+++ /dev/null
@@ -1,594 +0,0 @@
-/*
- * DSP utils mmx functions are compiled twice for rnd/no_rnd
- * Copyright (c) 2000, 2001 Fabrice Bellard.
- * Copyright (c) 2003-2004 Michael Niedermayer <michaelni at gmx.at>
- *
- * MMX optimization by Nick Kurshev <nickols_k at mail.ru>
- * mostly rewritten by Michael Niedermayer <michaelni at gmx.at>
- * and improved by Zdenek Kabelac <kabi at users.sf.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/* This header intentionally has no multiple inclusion guards. It is meant to
- * be included multiple times and generates different code depending on the
- * value of certain #defines. */
-
-// put_pixels
-static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- ASMALIGN(3)
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :REG_a, "memory");
-}
-
-static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%2), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "add $8, %2 \n\t"
- PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
- "movq %%mm4, (%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- ASMALIGN(3)
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%2), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "movq (%1), %%mm2 \n\t"
- "movq 8(%2), %%mm3 \n\t"
- "add %4, %1 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq %%mm5, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 16(%2), %%mm1 \n\t"
- "add %4, %1 \n\t"
- "movq (%1), %%mm2 \n\t"
- "movq 24(%2), %%mm3 \n\t"
- "add %4, %1 \n\t"
- "add $32, %2 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%3) \n\t"
- "add %5, %3 \n\t"
- "movq %%mm5, (%3) \n\t"
- "add %5, %3 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
-#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-}
-
-static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- ASMALIGN(3)
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "movq 8(%1), %%mm0 \n\t"
- "movq 9(%1), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm2 \n\t"
- "movq 9(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "movq 8(%1), %%mm0 \n\t"
- "movq 9(%1), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm2 \n\t"
- "movq 9(%1, %3), %%mm3 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :REG_a, "memory");
-}
-
-static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "testl $1, %0 \n\t"
- " jz 1f \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%2), %%mm1 \n\t"
- "movq 8(%1), %%mm2 \n\t"
- "movq 8(%2), %%mm3 \n\t"
- "add %4, %1 \n\t"
- "add $16, %2 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%3) \n\t"
- "movq %%mm5, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "decl %0 \n\t"
- ASMALIGN(3)
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%2), %%mm1 \n\t"
- "movq 8(%1), %%mm2 \n\t"
- "movq 8(%2), %%mm3 \n\t"
- "add %4, %1 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%3) \n\t"
- "movq %%mm5, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 16(%2), %%mm1 \n\t"
- "movq 8(%1), %%mm2 \n\t"
- "movq 24(%2), %%mm3 \n\t"
- "add %4, %1 \n\t"
- PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%3) \n\t"
- "movq %%mm5, 8(%3) \n\t"
- "add %5, %3 \n\t"
- "add $32, %2 \n\t"
- "subl $2, %0 \n\t"
- "jnz 1b \n\t"
-#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
- :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
- :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
- :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
- :"memory");
-}
-
-static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- ASMALIGN(3)
- "1: \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"),%%mm2 \n\t"
- PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"),%%mm0 \n\t"
- PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :REG_a, "memory");
-}
-
-static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- MOVQ_ZERO(mm7);
- SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
- __asm__ volatile(
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm4 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddusw %%mm0, %%mm4 \n\t"
- "paddusw %%mm1, %%mm5 \n\t"
- "xor %%"REG_a", %%"REG_a" \n\t"
- "add %3, %1 \n\t"
- ASMALIGN(3)
- "1: \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "paddusw %%mm2, %%mm0 \n\t"
- "paddusw %%mm3, %%mm1 \n\t"
- "paddusw %%mm6, %%mm4 \n\t"
- "paddusw %%mm6, %%mm5 \n\t"
- "paddusw %%mm0, %%mm4 \n\t"
- "paddusw %%mm1, %%mm5 \n\t"
- "psrlw $2, %%mm4 \n\t"
- "psrlw $2, %%mm5 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "movq %%mm4, (%2, %%"REG_a") \n\t"
- "add %3, %%"REG_a" \n\t"
-
- "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
- "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddusw %%mm2, %%mm4 \n\t"
- "paddusw %%mm3, %%mm5 \n\t"
- "paddusw %%mm6, %%mm0 \n\t"
- "paddusw %%mm6, %%mm1 \n\t"
- "paddusw %%mm4, %%mm0 \n\t"
- "paddusw %%mm5, %%mm1 \n\t"
- "psrlw $2, %%mm0 \n\t"
- "psrlw $2, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, (%2, %%"REG_a") \n\t"
- "add %3, %%"REG_a" \n\t"
-
- "subl $2, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels)
- :"D"(block), "r"((x86_reg)line_size)
- :REG_a, "memory");
-}
-
-// avg_pixels
-static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "movd %1, %%mm1 \n\t"
- PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
- "movd %%mm2, %0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
- :"memory");
- pixels += line_size;
- block += line_size;
- }
- while (--h);
-}
-
-// in case more speed is needed - unroling would certainly help
-static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movq %0, %%mm0 \n\t"
- "movq %1, %%mm1 \n\t"
- PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
- "movq %%mm2, %0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
- :"memory");
- pixels += line_size;
- block += line_size;
- }
- while (--h);
-}
-
-static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movq %0, %%mm0 \n\t"
- "movq %1, %%mm1 \n\t"
- PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
- "movq %%mm2, %0 \n\t"
- "movq 8%0, %%mm0 \n\t"
- "movq 8%1, %%mm1 \n\t"
- PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
- "movq %%mm2, 8%0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
- :"memory");
- pixels += line_size;
- block += line_size;
- }
- while (--h);
-}
-
-static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movq %1, %%mm0 \n\t"
- "movq 1%1, %%mm1 \n\t"
- "movq %0, %%mm3 \n\t"
- PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
- PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, %0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
- :"memory");
- pixels += line_size;
- block += line_size;
- } while (--h);
-}
-
-static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movq %1, %%mm0 \n\t"
- "movq %2, %%mm1 \n\t"
- "movq %0, %%mm3 \n\t"
- PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
- PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, %0 \n\t"
- :"+m"(*dst)
- :"m"(*src1), "m"(*src2)
- :"memory");
- dst += dstStride;
- src1 += src1Stride;
- src2 += 8;
- } while (--h);
-}
-
-static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movq %1, %%mm0 \n\t"
- "movq 1%1, %%mm1 \n\t"
- "movq %0, %%mm3 \n\t"
- PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
- PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, %0 \n\t"
- "movq 8%1, %%mm0 \n\t"
- "movq 9%1, %%mm1 \n\t"
- "movq 8%0, %%mm3 \n\t"
- PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
- PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, 8%0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
- :"memory");
- pixels += line_size;
- block += line_size;
- } while (--h);
-}
-
-static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movq %1, %%mm0 \n\t"
- "movq %2, %%mm1 \n\t"
- "movq %0, %%mm3 \n\t"
- PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
- PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, %0 \n\t"
- "movq 8%1, %%mm0 \n\t"
- "movq 8%2, %%mm1 \n\t"
- "movq 8%0, %%mm3 \n\t"
- PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
- PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, 8%0 \n\t"
- :"+m"(*dst)
- :"m"(*src1), "m"(*src2)
- :"memory");
- dst += dstStride;
- src1 += src1Stride;
- src2 += 16;
- } while (--h);
-}
-
-static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- ASMALIGN(3)
- "1: \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm2 \n\t"
- PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
- "movq (%2), %%mm3 \n\t"
- PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
- "movq (%2, %3), %%mm3 \n\t"
- PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
-
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
- "movq (%2), %%mm3 \n\t"
- PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
- "movq (%2, %3), %%mm3 \n\t"
- PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
- "movq %%mm2, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
-
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :REG_a, "memory");
-}
-
-// this routine is 'slightly' suboptimal but mostly unused
-static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
- MOVQ_ZERO(mm7);
- SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
- __asm__ volatile(
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm4 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddusw %%mm0, %%mm4 \n\t"
- "paddusw %%mm1, %%mm5 \n\t"
- "xor %%"REG_a", %%"REG_a" \n\t"
- "add %3, %1 \n\t"
- ASMALIGN(3)
- "1: \n\t"
- "movq (%1, %%"REG_a"), %%mm0 \n\t"
- "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "paddusw %%mm2, %%mm0 \n\t"
- "paddusw %%mm3, %%mm1 \n\t"
- "paddusw %%mm6, %%mm4 \n\t"
- "paddusw %%mm6, %%mm5 \n\t"
- "paddusw %%mm0, %%mm4 \n\t"
- "paddusw %%mm1, %%mm5 \n\t"
- "psrlw $2, %%mm4 \n\t"
- "psrlw $2, %%mm5 \n\t"
- "movq (%2, %%"REG_a"), %%mm3 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "pcmpeqd %%mm2, %%mm2 \n\t"
- "paddb %%mm2, %%mm2 \n\t"
- PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
- "movq %%mm5, (%2, %%"REG_a") \n\t"
- "add %3, %%"REG_a" \n\t"
-
- "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
- "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddusw %%mm2, %%mm4 \n\t"
- "paddusw %%mm3, %%mm5 \n\t"
- "paddusw %%mm6, %%mm0 \n\t"
- "paddusw %%mm6, %%mm1 \n\t"
- "paddusw %%mm4, %%mm0 \n\t"
- "paddusw %%mm5, %%mm1 \n\t"
- "psrlw $2, %%mm0 \n\t"
- "psrlw $2, %%mm1 \n\t"
- "movq (%2, %%"REG_a"), %%mm3 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "pcmpeqd %%mm2, %%mm2 \n\t"
- "paddb %%mm2, %%mm2 \n\t"
- PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
- "movq %%mm1, (%2, %%"REG_a") \n\t"
- "add %3, %%"REG_a" \n\t"
-
- "subl $2, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels)
- :"D"(block), "r"((x86_reg)line_size)
- :REG_a, "memory");
-}
-
-//FIXME optimize
-static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(put, pixels8_y2)(block , pixels , line_size, h);
- DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
-}
-
-static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(put, pixels8_xy2)(block , pixels , line_size, h);
- DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
-}
-
-static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(avg, pixels8_y2)(block , pixels , line_size, h);
- DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
-}
-
-static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
- DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
- DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
-}
diff --git a/libavcodec/i386/dsputil_yasm.asm b/libavcodec/i386/dsputil_yasm.asm
deleted file mode 100644
index 09beb5c..0000000
--- a/libavcodec/i386/dsputil_yasm.asm
+++ /dev/null
@@ -1,92 +0,0 @@
-;******************************************************************************
-;* MMX optimized DSP utils
-;* Copyright (c) 2008 Loren Merritt
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "x86inc.asm"
-
-section .text align=16
-
-%macro PSWAPD_SSE 2
- pshufw %1, %2, 0x4e
-%endmacro
-%macro PSWAPD_3DN1 2
- movq %1, %2
- psrlq %1, 32
- punpckldq %1, %2
-%endmacro
-
-%macro FLOAT_TO_INT16_INTERLEAVE6 1
-; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
-cglobal ff_float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
-%ifdef ARCH_X86_64
- %define lend r10d
- mov lend, r2d
-%else
- %define lend dword r2m
-%endif
- mov src1q, [srcq+1*gprsize]
- mov src2q, [srcq+2*gprsize]
- mov src3q, [srcq+3*gprsize]
- mov src4q, [srcq+4*gprsize]
- mov src5q, [srcq+5*gprsize]
- mov srcq, [srcq]
- sub src1q, srcq
- sub src2q, srcq
- sub src3q, srcq
- sub src4q, srcq
- sub src5q, srcq
-.loop:
- cvtps2pi mm0, [srcq]
- cvtps2pi mm1, [srcq+src1q]
- cvtps2pi mm2, [srcq+src2q]
- cvtps2pi mm3, [srcq+src3q]
- cvtps2pi mm4, [srcq+src4q]
- cvtps2pi mm5, [srcq+src5q]
- packssdw mm0, mm3
- packssdw mm1, mm4
- packssdw mm2, mm5
- pswapd mm3, mm0
- punpcklwd mm0, mm1
- punpckhwd mm1, mm2
- punpcklwd mm2, mm3
- pswapd mm3, mm0
- punpckldq mm0, mm2
- punpckhdq mm2, mm1
- punpckldq mm1, mm3
- movq [dstq ], mm0
- movq [dstq+16], mm2
- movq [dstq+ 8], mm1
- add srcq, 8
- add dstq, 24
- sub lend, 2
- jg .loop
- emms
- RET
-%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
-
-%define pswapd PSWAPD_SSE
-FLOAT_TO_INT16_INTERLEAVE6 sse
-%define cvtps2pi pf2id
-%define pswapd PSWAPD_3DN1
-FLOAT_TO_INT16_INTERLEAVE6 3dnow
-%undef pswapd
-FLOAT_TO_INT16_INTERLEAVE6 3dn2
-%undef cvtps2pi
-
diff --git a/libavcodec/i386/fft_mmx.asm b/libavcodec/i386/fft_mmx.asm
deleted file mode 100644
index c0a9bd5..0000000
--- a/libavcodec/i386/fft_mmx.asm
+++ /dev/null
@@ -1,467 +0,0 @@
-;******************************************************************************
-;* FFT transform with SSE/3DNow optimizations
-;* Copyright (c) 2008 Loren Merritt
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-; These functions are not individually interchangeable with the C versions.
-; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
-; in blocks as conventient to the vector size.
-; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
-
-%include "x86inc.asm"
-
-SECTION_RODATA
-
-%define M_SQRT1_2 0.70710678118654752440
-ps_root2: times 4 dd M_SQRT1_2
-ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
-ps_m1p1: dd 1<<31, 0
-
-%assign i 16
-%rep 13
-cextern ff_cos_ %+ i
-%assign i i<<1
-%endrep
-
-%ifdef ARCH_X86_64
- %define pointer dq
-%else
- %define pointer dd
-%endif
-
-%macro IF0 1+
-%endmacro
-%macro IF1 1+
- %1
-%endmacro
-
-section .text align=16
-
-%macro T2_3DN 4 ; z0, z1, mem0, mem1
- mova %1, %3
- mova %2, %1
- pfadd %1, %4
- pfsub %2, %4
-%endmacro
-
-%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
- mova %5, %3
- pfsub %3, %4
- pfadd %5, %4 ; {t6,t5}
- pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7}
- mova %6, %1
- pswapd %3, %3
- pfadd %1, %5 ; {r0,i0}
- pfsub %6, %5 ; {r2,i2}
- mova %4, %2
- pfadd %2, %3 ; {r1,i1}
- pfsub %4, %3 ; {r3,i3}
- SWAP %3, %6
-%endmacro
-
-; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
-; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
-%macro T4_SSE 3
- mova %3, %1
- shufps %1, %2, 0x64 ; {r0,i0,r3,i2}
- shufps %3, %2, 0xce ; {r1,i1,r2,i3}
- mova %2, %1
- addps %1, %3 ; {t1,t2,t6,t5}
- subps %2, %3 ; {t3,t4,t8,t7}
- mova %3, %1
- shufps %1, %2, 0x44 ; {t1,t2,t3,t4}
- shufps %3, %2, 0xbe ; {t6,t5,t7,t8}
- mova %2, %1
- addps %1, %3 ; {r0,i0,r1,i1}
- subps %2, %3 ; {r2,i2,r3,i3}
- mova %3, %1
- shufps %1, %2, 0x88 ; {r0,r1,r2,r3}
- shufps %3, %2, 0xdd ; {i0,i1,i2,i3}
- SWAP %2, %3
-%endmacro
-
-%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
- mova %5, %3
- shufps %3, %4, 0x44 ; {r4,i4,r6,i6}
- shufps %5, %4, 0xee ; {r5,i5,r7,i7}
- mova %6, %3
- subps %3, %5 ; {r5,i5,r7,i7}
- addps %6, %5 ; {t1,t2,t3,t4}
- mova %5, %3
- shufps %5, %5, 0xb1 ; {i5,r5,i7,r7}
- mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7}
- mulps %5, [ps_root2 GLOBAL]
- addps %3, %5 ; {t8,t7,ta,t9}
- mova %5, %6
- shufps %6, %3, 0x36 ; {t3,t2,t9,t8}
- shufps %5, %3, 0x9c ; {t1,t4,t7,ta}
- mova %3, %6
- addps %6, %5 ; {t1,t2,t9,ta}
- subps %3, %5 ; {t6,t5,tc,tb}
- mova %5, %6
- shufps %6, %3, 0xd8 ; {t1,t9,t5,tb}
- shufps %5, %3, 0x8d ; {t2,ta,t6,tc}
- mova %3, %1
- mova %4, %2
- addps %1, %6 ; {r0,r1,r2,r3}
- addps %2, %5 ; {i0,i1,i2,i3}
- subps %3, %6 ; {r4,r5,r6,r7}
- subps %4, %5 ; {i4,i5,i6,i7}
-%endmacro
-
-; scheduled for cpu-bound sizes
-%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
-IF%1 mova m4, Z(4)
-IF%1 mova m5, Z(5)
- mova m0, %2 ; wre
- mova m2, m4
- mova m1, %3 ; wim
- mova m3, m5
- mulps m2, m0 ; r2*wre
-IF%1 mova m6, Z(6)
- mulps m3, m1 ; i2*wim
-IF%1 mova m7, Z(7)
- mulps m4, m1 ; r2*wim
- mulps m5, m0 ; i2*wre
- addps m2, m3 ; r2*wre + i2*wim
- mova m3, m1
- mulps m1, m6 ; r3*wim
- subps m5, m4 ; i2*wre - r2*wim
- mova m4, m0
- mulps m3, m7 ; i3*wim
- mulps m4, m6 ; r3*wre
- mulps m0, m7 ; i3*wre
- subps m4, m3 ; r3*wre - i3*wim
- mova m3, Z(0)
- addps m0, m1 ; i3*wre + r3*wim
- mova m1, m4
- addps m4, m2 ; t5
- subps m1, m2 ; t3
- subps m3, m4 ; r2
- addps m4, Z(0) ; r0
- mova m6, Z(2)
- mova Z(4), m3
- mova Z(0), m4
- mova m3, m5
- subps m5, m0 ; t4
- mova m4, m6
- subps m6, m5 ; r3
- addps m5, m4 ; r1
- mova Z(6), m6
- mova Z(2), m5
- mova m2, Z(3)
- addps m3, m0 ; t6
- subps m2, m1 ; i3
- mova m7, Z(1)
- addps m1, Z(3) ; i1
- mova Z(7), m2
- mova Z(3), m1
- mova m4, m7
- subps m7, m3 ; i2
- addps m3, m4 ; i0
- mova Z(5), m7
- mova Z(1), m3
-%endmacro
-
-; scheduled to avoid store->load aliasing
-%macro PASS_BIG 1 ; (!interleave)
- mova m4, Z(4) ; r2
- mova m5, Z(5) ; i2
- mova m2, m4
- mova m0, [wq] ; wre
- mova m3, m5
- mova m1, [wq+o1q] ; wim
- mulps m2, m0 ; r2*wre
- mova m6, Z(6) ; r3
- mulps m3, m1 ; i2*wim
- mova m7, Z(7) ; i3
- mulps m4, m1 ; r2*wim
- mulps m5, m0 ; i2*wre
- addps m2, m3 ; r2*wre + i2*wim
- mova m3, m1
- mulps m1, m6 ; r3*wim
- subps m5, m4 ; i2*wre - r2*wim
- mova m4, m0
- mulps m3, m7 ; i3*wim
- mulps m4, m6 ; r3*wre
- mulps m0, m7 ; i3*wre
- subps m4, m3 ; r3*wre - i3*wim
- mova m3, Z(0)
- addps m0, m1 ; i3*wre + r3*wim
- mova m1, m4
- addps m4, m2 ; t5
- subps m1, m2 ; t3
- subps m3, m4 ; r2
- addps m4, Z(0) ; r0
- mova m6, Z(2)
- mova Z(4), m3
- mova Z(0), m4
- mova m3, m5
- subps m5, m0 ; t4
- mova m4, m6
- subps m6, m5 ; r3
- addps m5, m4 ; r1
-IF%1 mova Z(6), m6
-IF%1 mova Z(2), m5
- mova m2, Z(3)
- addps m3, m0 ; t6
- subps m2, m1 ; i3
- mova m7, Z(1)
- addps m1, Z(3) ; i1
-IF%1 mova Z(7), m2
-IF%1 mova Z(3), m1
- mova m4, m7
- subps m7, m3 ; i2
- addps m3, m4 ; i0
-IF%1 mova Z(5), m7
-IF%1 mova Z(1), m3
-%if %1==0
- mova m4, m5 ; r1
- mova m0, m6 ; r3
- unpcklps m5, m1
- unpckhps m4, m1
- unpcklps m6, m2
- unpckhps m0, m2
- mova m1, Z(0)
- mova m2, Z(4)
- mova Z(2), m5
- mova Z(3), m4
- mova Z(6), m6
- mova Z(7), m0
- mova m5, m1 ; r0
- mova m4, m2 ; r2
- unpcklps m1, m3
- unpckhps m5, m3
- unpcklps m2, m7
- unpckhps m4, m7
- mova Z(0), m1
- mova Z(1), m5
- mova Z(4), m2
- mova Z(5), m4
-%endif
-%endmacro
-
-%macro PUNPCK 3
- mova %3, %1
- punpckldq %1, %2
- punpckhdq %3, %2
-%endmacro
-
-INIT_XMM
-
-%define Z(x) [r0+mmsize*x]
-
-align 16
-fft4_sse:
- mova m0, Z(0)
- mova m1, Z(1)
- T4_SSE m0, m1, m2
- mova Z(0), m0
- mova Z(1), m1
- ret
-
-align 16
-fft8_sse:
- mova m0, Z(0)
- mova m1, Z(1)
- T4_SSE m0, m1, m2
- mova m2, Z(2)
- mova m3, Z(3)
- T8_SSE m0, m1, m2, m3, m4, m5
- mova Z(0), m0
- mova Z(1), m1
- mova Z(2), m2
- mova Z(3), m3
- ret
-
-align 16
-fft16_sse:
- mova m0, Z(0)
- mova m1, Z(1)
- T4_SSE m0, m1, m2
- mova m2, Z(2)
- mova m3, Z(3)
- T8_SSE m0, m1, m2, m3, m4, m5
- mova m4, Z(4)
- mova m5, Z(5)
- mova Z(0), m0
- mova Z(1), m1
- mova Z(2), m2
- mova Z(3), m3
- T4_SSE m4, m5, m6
- mova m6, Z(6)
- mova m7, Z(7)
- T4_SSE m6, m7, m0
- PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL]
- ret
-
-
-INIT_MMX
-
-%macro FFT48_3DN 1
-align 16
-fft4%1:
- T2_3DN m0, m1, Z(0), Z(1)
- mova m2, Z(2)
- mova m3, Z(3)
- T4_3DN m0, m1, m2, m3, m4, m5
- PUNPCK m0, m1, m4
- PUNPCK m2, m3, m5
- mova Z(0), m0
- mova Z(1), m4
- mova Z(2), m2
- mova Z(3), m5
- ret
-
-align 16
-fft8%1:
- T2_3DN m0, m1, Z(0), Z(1)
- mova m2, Z(2)
- mova m3, Z(3)
- T4_3DN m0, m1, m2, m3, m4, m5
- mova Z(0), m0
- mova Z(2), m2
- T2_3DN m4, m5, Z(4), Z(5)
- T2_3DN m6, m7, Z(6), Z(7)
- pswapd m0, m5
- pswapd m2, m7
- pxor m0, [ps_m1p1 GLOBAL]
- pxor m2, [ps_m1p1 GLOBAL]
- pfsub m5, m0
- pfadd m7, m2
- pfmul m5, [ps_root2 GLOBAL]
- pfmul m7, [ps_root2 GLOBAL]
- T4_3DN m1, m3, m5, m7, m0, m2
- mova Z(5), m5
- mova Z(7), m7
- mova m0, Z(0)
- mova m2, Z(2)
- T4_3DN m0, m2, m4, m6, m5, m7
- PUNPCK m0, m1, m5
- PUNPCK m2, m3, m7
- mova Z(0), m0
- mova Z(1), m5
- mova Z(2), m2
- mova Z(3), m7
- PUNPCK m4, Z(5), m5
- PUNPCK m6, Z(7), m7
- mova Z(4), m4
- mova Z(5), m5
- mova Z(6), m6
- mova Z(7), m7
- ret
-%endmacro
-
-FFT48_3DN _3dn2
-
-%macro pswapd 2
-%ifidn %1, %2
- movd [r0+12], %1
- punpckhdq %1, [r0+8]
-%else
- movq %1, %2
- psrlq %1, 32
- punpckldq %1, %2
-%endif
-%endmacro
-
-FFT48_3DN _3dn
-
-
-%define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)]
-
-%macro DECL_PASS 2+ ; name, payload
-align 16
-%1:
-DEFINE_ARGS z, w, n, o1, o3
- lea o3q, [nq*3]
- lea o1q, [nq*8]
- shl o3q, 4
-.loop:
- %2
- add zq, mmsize*2
- add wq, mmsize
- sub nd, mmsize/8
- jg .loop
- rep ret
-%endmacro
-
-INIT_XMM
-DECL_PASS pass_sse, PASS_BIG 1
-DECL_PASS pass_interleave_sse, PASS_BIG 0
-
-INIT_MMX
-%define mulps pfmul
-%define addps pfadd
-%define subps pfsub
-%define unpcklps punpckldq
-%define unpckhps punpckhdq
-DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
-DECL_PASS pass_interleave_3dn, PASS_BIG 0
-%define pass_3dn2 pass_3dn
-%define pass_interleave_3dn2 pass_interleave_3dn
-
-
-%macro DECL_FFT 2-3 ; nbits, cpu, suffix
-%xdefine list_of_fft fft4%2, fft8%2
-%if %1==5
-%xdefine list_of_fft list_of_fft, fft16%2
-%endif
-
-%assign n 1<<%1
-%rep 17-%1
-%assign n2 n/2
-%assign n4 n/4
-%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2
-
-align 16
-fft %+ n %+ %3%2:
- call fft %+ n2 %+ %2
- add r0, n*4 - (n&(-2<<%1))
- call fft %+ n4 %+ %2
- add r0, n*2 - (n2&(-2<<%1))
- call fft %+ n4 %+ %2
- sub r0, n*6 + (n2&(-2<<%1))
- lea r1, [ff_cos_ %+ n GLOBAL]
- mov r2d, n4/2
- jmp pass%3%2
-
-%assign n n*2
-%endrep
-%undef n
-
-align 8
-dispatch_tab%3%2: pointer list_of_fft
-
-; On x86_32, this function does the register saving and restoring for all of fft.
-; The others pass args in registers and don't spill anything.
-cglobal ff_fft_dispatch%3%2, 2,5,0, z, nbits
- lea r2, [dispatch_tab%3%2 GLOBAL]
- mov r2, [r2 + (nbitsq-2)*gprsize]
- call r2
- RET
-%endmacro ; DECL_FFT
-
-DECL_FFT 5, _sse
-DECL_FFT 5, _sse, _interleave
-DECL_FFT 4, _3dn
-DECL_FFT 4, _3dn, _interleave
-DECL_FFT 4, _3dn2
-DECL_FFT 4, _3dn2, _interleave
-
diff --git a/libavcodec/i386/h264_i386.h b/libavcodec/i386/h264_i386.h
deleted file mode 100644
index ed62dd6..0000000
--- a/libavcodec/i386/h264_i386.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
- * Copyright (c) 2003 Michael Niedermayer <michaelni at gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file h264_i386.h
- * H.264 / AVC / MPEG4 part10 codec.
- * non-MMX i386-specific optimizations for H.264
- * @author Michael Niedermayer <michaelni at gmx.at>
- */
-
-#ifndef AVCODEC_I386_H264_I386_H
-#define AVCODEC_I386_H264_I386_H
-
-#include "libavcodec/cabac.h"
-
-//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
-//as that would make optimization work hard)
-#if defined(ARCH_X86) && defined(HAVE_7REGS) && \
- defined(HAVE_EBX_AVAILABLE) && \
- !defined(BROKEN_RELOCATIONS)
-static int decode_significance_x86(CABACContext *c, int max_coeff,
- uint8_t *significant_coeff_ctx_base,
- int *index){
- void *end= significant_coeff_ctx_base + max_coeff - 1;
- int minusstart= -(int)significant_coeff_ctx_base;
- int minusindex= 4-(int)index;
- int coeff_count;
- __asm__ volatile(
- "movl "RANGE "(%3), %%esi \n\t"
- "movl "LOW "(%3), %%ebx \n\t"
-
- "2: \n\t"
-
- BRANCHLESS_GET_CABAC("%%edx", "%3", "(%1)", "%%ebx",
- "%%bx", "%%esi", "%%eax", "%%al")
-
- "test $1, %%edx \n\t"
- " jz 3f \n\t"
-
- BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx",
- "%%bx", "%%esi", "%%eax", "%%al")
-
- "mov %2, %%"REG_a" \n\t"
- "movl %4, %%ecx \n\t"
- "add %1, %%"REG_c" \n\t"
- "movl %%ecx, (%%"REG_a") \n\t"
-
- "test $1, %%edx \n\t"
- " jnz 4f \n\t"
-
- "add $4, %%"REG_a" \n\t"
- "mov %%"REG_a", %2 \n\t"
-
- "3: \n\t"
- "add $1, %1 \n\t"
- "cmp %5, %1 \n\t"
- " jb 2b \n\t"
- "mov %2, %%"REG_a" \n\t"
- "movl %4, %%ecx \n\t"
- "add %1, %%"REG_c" \n\t"
- "movl %%ecx, (%%"REG_a") \n\t"
- "4: \n\t"
- "add %6, %%eax \n\t"
- "shr $2, %%eax \n\t"
-
- "movl %%esi, "RANGE "(%3) \n\t"
- "movl %%ebx, "LOW "(%3) \n\t"
- :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index)
- :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex)
- : "%"REG_c, "%ebx", "%edx", "%esi", "memory"
- );
- return coeff_count;
-}
-
-static int decode_significance_8x8_x86(CABACContext *c,
- uint8_t *significant_coeff_ctx_base,
- int *index, const uint8_t *sig_off){
- int minusindex= 4-(int)index;
- int coeff_count;
- x86_reg last=0;
- __asm__ volatile(
- "movl "RANGE "(%3), %%esi \n\t"
- "movl "LOW "(%3), %%ebx \n\t"
-
- "mov %1, %%"REG_D" \n\t"
- "2: \n\t"
-
- "mov %6, %%"REG_a" \n\t"
- "movzbl (%%"REG_a", %%"REG_D"), %%edi \n\t"
- "add %5, %%"REG_D" \n\t"
-
- BRANCHLESS_GET_CABAC("%%edx", "%3", "(%%"REG_D")", "%%ebx",
- "%%bx", "%%esi", "%%eax", "%%al")
-
- "mov %1, %%edi \n\t"
- "test $1, %%edx \n\t"
- " jz 3f \n\t"
-
- "movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%%edi), %%edi\n\t"
- "add %5, %%"REG_D" \n\t"
-
- BRANCHLESS_GET_CABAC("%%edx", "%3", "15(%%"REG_D")", "%%ebx",
- "%%bx", "%%esi", "%%eax", "%%al")
-
- "mov %2, %%"REG_a" \n\t"
- "mov %1, %%edi \n\t"
- "movl %%edi, (%%"REG_a") \n\t"
-
- "test $1, %%edx \n\t"
- " jnz 4f \n\t"
-
- "add $4, %%"REG_a" \n\t"
- "mov %%"REG_a", %2 \n\t"
-
- "3: \n\t"
- "addl $1, %%edi \n\t"
- "mov %%edi, %1 \n\t"
- "cmpl $63, %%edi \n\t"
- " jb 2b \n\t"
- "mov %2, %%"REG_a" \n\t"
- "movl %%edi, (%%"REG_a") \n\t"
- "4: \n\t"
- "addl %4, %%eax \n\t"
- "shr $2, %%eax \n\t"
-
- "movl %%esi, "RANGE "(%3) \n\t"
- "movl %%ebx, "LOW "(%3) \n\t"
- :"=&a"(coeff_count),"+m"(last), "+m"(index)
- :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off)
- : "%"REG_c, "%ebx", "%edx", "%esi", "%"REG_D, "memory"
- );
- return coeff_count;
-}
-#endif /* defined(ARCH_X86) && defined(HAVE_7REGS) && */
- /* defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS) */
-
-#endif /* AVCODEC_I386_H264_I386_H */
diff --git a/libavcodec/i386/h264dsp_mmx.c b/libavcodec/i386/h264dsp_mmx.c
deleted file mode 100644
index bb9c82d..0000000
--- a/libavcodec/i386/h264dsp_mmx.c
+++ /dev/null
@@ -1,2113 +0,0 @@
-/*
- * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "dsputil_mmx.h"
-
-DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;
-DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;
-
-/***********************************/
-/* IDCT */
-
-#define SUMSUB_BADC( a, b, c, d ) \
- "paddw "#b", "#a" \n\t"\
- "paddw "#d", "#c" \n\t"\
- "paddw "#b", "#b" \n\t"\
- "paddw "#d", "#d" \n\t"\
- "psubw "#a", "#b" \n\t"\
- "psubw "#c", "#d" \n\t"
-
-#define SUMSUBD2_AB( a, b, t ) \
- "movq "#b", "#t" \n\t"\
- "psraw $1 , "#b" \n\t"\
- "paddw "#a", "#b" \n\t"\
- "psraw $1 , "#a" \n\t"\
- "psubw "#t", "#a" \n\t"
-
-#define IDCT4_1D( s02, s13, d02, d13, t ) \
- SUMSUB_BA ( s02, d02 )\
- SUMSUBD2_AB( s13, d13, t )\
- SUMSUB_BADC( d13, s02, s13, d02 )
-
-#define STORE_DIFF_4P( p, t, z ) \
- "psraw $6, "#p" \n\t"\
- "movd (%0), "#t" \n\t"\
- "punpcklbw "#z", "#t" \n\t"\
- "paddsw "#t", "#p" \n\t"\
- "packuswb "#z", "#p" \n\t"\
- "movd "#p", (%0) \n\t"
-
-static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
-{
- /* Load dct coeffs */
- __asm__ volatile(
- "movq (%0), %%mm0 \n\t"
- "movq 8(%0), %%mm1 \n\t"
- "movq 16(%0), %%mm2 \n\t"
- "movq 24(%0), %%mm3 \n\t"
- :: "r"(block) );
-
- __asm__ volatile(
- /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */
- IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
-
- "movq %0, %%mm6 \n\t"
- /* in: 1,4,0,2 out: 1,2,3,0 */
- TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
-
- "paddw %%mm6, %%mm3 \n\t"
-
- /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */
- IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
-
- "pxor %%mm7, %%mm7 \n\t"
- :: "m"(ff_pw_32));
-
- __asm__ volatile(
- STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
- "add %1, %0 \n\t"
- STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
- "add %1, %0 \n\t"
- STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
- "add %1, %0 \n\t"
- STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
- : "+r"(dst)
- : "r" ((x86_reg)stride)
- );
-}
-
-static inline void h264_idct8_1d(int16_t *block)
-{
- __asm__ volatile(
- "movq 112(%0), %%mm7 \n\t"
- "movq 80(%0), %%mm0 \n\t"
- "movq 48(%0), %%mm3 \n\t"
- "movq 16(%0), %%mm5 \n\t"
-
- "movq %%mm0, %%mm4 \n\t"
- "movq %%mm5, %%mm1 \n\t"
- "psraw $1, %%mm4 \n\t"
- "psraw $1, %%mm1 \n\t"
- "paddw %%mm0, %%mm4 \n\t"
- "paddw %%mm5, %%mm1 \n\t"
- "paddw %%mm7, %%mm4 \n\t"
- "paddw %%mm0, %%mm1 \n\t"
- "psubw %%mm5, %%mm4 \n\t"
- "paddw %%mm3, %%mm1 \n\t"
-
- "psubw %%mm3, %%mm5 \n\t"
- "psubw %%mm3, %%mm0 \n\t"
- "paddw %%mm7, %%mm5 \n\t"
- "psubw %%mm7, %%mm0 \n\t"
- "psraw $1, %%mm3 \n\t"
- "psraw $1, %%mm7 \n\t"
- "psubw %%mm3, %%mm5 \n\t"
- "psubw %%mm7, %%mm0 \n\t"
-
- "movq %%mm4, %%mm3 \n\t"
- "movq %%mm1, %%mm7 \n\t"
- "psraw $2, %%mm1 \n\t"
- "psraw $2, %%mm3 \n\t"
- "paddw %%mm5, %%mm3 \n\t"
- "psraw $2, %%mm5 \n\t"
- "paddw %%mm0, %%mm1 \n\t"
- "psraw $2, %%mm0 \n\t"
- "psubw %%mm4, %%mm5 \n\t"
- "psubw %%mm0, %%mm7 \n\t"
-
- "movq 32(%0), %%mm2 \n\t"
- "movq 96(%0), %%mm6 \n\t"
- "movq %%mm2, %%mm4 \n\t"
- "movq %%mm6, %%mm0 \n\t"
- "psraw $1, %%mm4 \n\t"
- "psraw $1, %%mm6 \n\t"
- "psubw %%mm0, %%mm4 \n\t"
- "paddw %%mm2, %%mm6 \n\t"
-
- "movq (%0), %%mm2 \n\t"
- "movq 64(%0), %%mm0 \n\t"
- SUMSUB_BA( %%mm0, %%mm2 )
- SUMSUB_BA( %%mm6, %%mm0 )
- SUMSUB_BA( %%mm4, %%mm2 )
- SUMSUB_BA( %%mm7, %%mm6 )
- SUMSUB_BA( %%mm5, %%mm4 )
- SUMSUB_BA( %%mm3, %%mm2 )
- SUMSUB_BA( %%mm1, %%mm0 )
- :: "r"(block)
- );
-}
-
-static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
-{
- int i;
- int16_t __attribute__ ((aligned(8))) b2[64];
-
- block[0] += 32;
-
- for(i=0; i<2; i++){
- DECLARE_ALIGNED_8(uint64_t, tmp);
-
- h264_idct8_1d(block+4*i);
-
- __asm__ volatile(
- "movq %%mm7, %0 \n\t"
- TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
- "movq %%mm0, 8(%1) \n\t"
- "movq %%mm6, 24(%1) \n\t"
- "movq %%mm7, 40(%1) \n\t"
- "movq %%mm4, 56(%1) \n\t"
- "movq %0, %%mm7 \n\t"
- TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
- "movq %%mm7, (%1) \n\t"
- "movq %%mm1, 16(%1) \n\t"
- "movq %%mm0, 32(%1) \n\t"
- "movq %%mm3, 48(%1) \n\t"
- : "=m"(tmp)
- : "r"(b2+32*i)
- : "memory"
- );
- }
-
- for(i=0; i<2; i++){
- h264_idct8_1d(b2+4*i);
-
- __asm__ volatile(
- "psraw $6, %%mm7 \n\t"
- "psraw $6, %%mm6 \n\t"
- "psraw $6, %%mm5 \n\t"
- "psraw $6, %%mm4 \n\t"
- "psraw $6, %%mm3 \n\t"
- "psraw $6, %%mm2 \n\t"
- "psraw $6, %%mm1 \n\t"
- "psraw $6, %%mm0 \n\t"
-
- "movq %%mm7, (%0) \n\t"
- "movq %%mm5, 16(%0) \n\t"
- "movq %%mm3, 32(%0) \n\t"
- "movq %%mm1, 48(%0) \n\t"
- "movq %%mm0, 64(%0) \n\t"
- "movq %%mm2, 80(%0) \n\t"
- "movq %%mm4, 96(%0) \n\t"
- "movq %%mm6, 112(%0) \n\t"
- :: "r"(b2+4*i)
- : "memory"
- );
- }
-
- add_pixels_clamped_mmx(b2, dst, stride);
-}
-
-#define STORE_DIFF_8P( p, d, t, z )\
- "movq "#d", "#t" \n"\
- "psraw $6, "#p" \n"\
- "punpcklbw "#z", "#t" \n"\
- "paddsw "#t", "#p" \n"\
- "packuswb "#p", "#p" \n"\
- "movq "#p", "#d" \n"
-
-#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
- "movdqa "#c", "#a" \n"\
- "movdqa "#g", "#e" \n"\
- "psraw $1, "#c" \n"\
- "psraw $1, "#g" \n"\
- "psubw "#e", "#c" \n"\
- "paddw "#a", "#g" \n"\
- "movdqa "#b", "#e" \n"\
- "psraw $1, "#e" \n"\
- "paddw "#b", "#e" \n"\
- "paddw "#d", "#e" \n"\
- "paddw "#f", "#e" \n"\
- "movdqa "#f", "#a" \n"\
- "psraw $1, "#a" \n"\
- "paddw "#f", "#a" \n"\
- "paddw "#h", "#a" \n"\
- "psubw "#b", "#a" \n"\
- "psubw "#d", "#b" \n"\
- "psubw "#d", "#f" \n"\
- "paddw "#h", "#b" \n"\
- "psubw "#h", "#f" \n"\
- "psraw $1, "#d" \n"\
- "psraw $1, "#h" \n"\
- "psubw "#d", "#b" \n"\
- "psubw "#h", "#f" \n"\
- "movdqa "#e", "#d" \n"\
- "movdqa "#a", "#h" \n"\
- "psraw $2, "#d" \n"\
- "psraw $2, "#h" \n"\
- "paddw "#f", "#d" \n"\
- "paddw "#b", "#h" \n"\
- "psraw $2, "#f" \n"\
- "psraw $2, "#b" \n"\
- "psubw "#f", "#e" \n"\
- "psubw "#a", "#b" \n"\
- "movdqa 0x00(%1), "#a" \n"\
- "movdqa 0x40(%1), "#f" \n"\
- SUMSUB_BA(f, a)\
- SUMSUB_BA(g, f)\
- SUMSUB_BA(c, a)\
- SUMSUB_BA(e, g)\
- SUMSUB_BA(b, c)\
- SUMSUB_BA(h, a)\
- SUMSUB_BA(d, f)
-
-static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
-{
- __asm__ volatile(
- "movdqa 0x10(%1), %%xmm1 \n"
- "movdqa 0x20(%1), %%xmm2 \n"
- "movdqa 0x30(%1), %%xmm3 \n"
- "movdqa 0x50(%1), %%xmm5 \n"
- "movdqa 0x60(%1), %%xmm6 \n"
- "movdqa 0x70(%1), %%xmm7 \n"
- H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
- TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
- "paddw %4, %%xmm4 \n"
- "movdqa %%xmm4, 0x00(%1) \n"
- "movdqa %%xmm2, 0x40(%1) \n"
- H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
- "movdqa %%xmm6, 0x60(%1) \n"
- "movdqa %%xmm7, 0x70(%1) \n"
- "pxor %%xmm7, %%xmm7 \n"
- STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7)
- STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7)
- STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
- STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7)
- "lea (%0,%2,4), %0 \n"
- STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7)
- STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7)
- "movdqa 0x60(%1), %%xmm0 \n"
- "movdqa 0x70(%1), %%xmm1 \n"
- STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
- STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7)
- :"+r"(dst)
- :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
- );
-}
-
-static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
-{
- int dc = (block[0] + 32) >> 6;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movd %0, %%mm2 \n\t"
- "movd %1, %%mm3 \n\t"
- "movd %2, %%mm4 \n\t"
- "movd %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movd %%mm2, %0 \n\t"
- "movd %%mm3, %1 \n\t"
- "movd %%mm4, %2 \n\t"
- "movd %%mm5, %3 \n\t"
- :"+m"(*(uint32_t*)(dst+0*stride)),
- "+m"(*(uint32_t*)(dst+1*stride)),
- "+m"(*(uint32_t*)(dst+2*stride)),
- "+m"(*(uint32_t*)(dst+3*stride))
- );
-}
-
-static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
-{
- int dc = (block[0] + 32) >> 6;
- int y;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- for(y=2; y--; dst += 4*stride){
- __asm__ volatile(
- "movq %0, %%mm2 \n\t"
- "movq %1, %%mm3 \n\t"
- "movq %2, %%mm4 \n\t"
- "movq %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movq %%mm2, %0 \n\t"
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %3 \n\t"
- :"+m"(*(uint64_t*)(dst+0*stride)),
- "+m"(*(uint64_t*)(dst+1*stride)),
- "+m"(*(uint64_t*)(dst+2*stride)),
- "+m"(*(uint64_t*)(dst+3*stride))
- );
- }
-}
-
-
-/***********************************/
-/* deblocking */
-
-// out: o = |x-y|>a
-// clobbers: t
-#define DIFF_GT_MMX(x,y,a,o,t)\
- "movq "#y", "#t" \n\t"\
- "movq "#x", "#o" \n\t"\
- "psubusb "#x", "#t" \n\t"\
- "psubusb "#y", "#o" \n\t"\
- "por "#t", "#o" \n\t"\
- "psubusb "#a", "#o" \n\t"
-
-// out: o = |x-y|>a
-// clobbers: t
-#define DIFF_GT2_MMX(x,y,a,o,t)\
- "movq "#y", "#t" \n\t"\
- "movq "#x", "#o" \n\t"\
- "psubusb "#x", "#t" \n\t"\
- "psubusb "#y", "#o" \n\t"\
- "psubusb "#a", "#t" \n\t"\
- "psubusb "#a", "#o" \n\t"\
- "pcmpeqb "#t", "#o" \n\t"\
-
-// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
-// out: mm5=beta-1, mm7=mask
-// clobbers: mm4,mm6
-#define H264_DEBLOCK_MASK(alpha1, beta1) \
- "pshufw $0, "#alpha1", %%mm4 \n\t"\
- "pshufw $0, "#beta1 ", %%mm5 \n\t"\
- "packuswb %%mm4, %%mm4 \n\t"\
- "packuswb %%mm5, %%mm5 \n\t"\
- DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
- DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
- "por %%mm4, %%mm7 \n\t"\
- DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
- "por %%mm4, %%mm7 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- "pcmpeqb %%mm6, %%mm7 \n\t"
-
-// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
-// out: mm1=p0' mm2=q0'
-// clobbers: mm0,3-6
-#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
- "movq %%mm1 , %%mm5 \n\t"\
- "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\
- "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\
- "pcmpeqb %%mm4 , %%mm4 \n\t"\
- "pxor %%mm4 , %%mm3 \n\t"\
- "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
- "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\
- "pxor %%mm1 , %%mm4 \n\t"\
- "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
- "pavgb %%mm5 , %%mm3 \n\t"\
- "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\
- "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
- "psubusb %%mm3 , %%mm6 \n\t"\
- "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
- "pminub %%mm7 , %%mm6 \n\t"\
- "pminub %%mm7 , %%mm3 \n\t"\
- "psubusb %%mm6 , %%mm1 \n\t"\
- "psubusb %%mm3 , %%mm2 \n\t"\
- "paddusb %%mm3 , %%mm1 \n\t"\
- "paddusb %%mm6 , %%mm2 \n\t"
-
-// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone
-// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
-// clobbers: q2, tmp, tc0
-#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
- "movq %%mm1, "#tmp" \n\t"\
- "pavgb %%mm2, "#tmp" \n\t"\
- "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\
- "pxor "q2addr", "#tmp" \n\t"\
- "pand %8, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\
- "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
- "movq "#p1", "#tmp" \n\t"\
- "psubusb "#tc0", "#tmp" \n\t"\
- "paddusb "#p1", "#tc0" \n\t"\
- "pmaxub "#tmp", "#q2" \n\t"\
- "pminub "#tc0", "#q2" \n\t"\
- "movq "#q2", "q1addr" \n\t"
-
-static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
-{
- DECLARE_ALIGNED_8(uint64_t, tmp0[2]);
-
- __asm__ volatile(
- "movq (%1,%3), %%mm0 \n\t" //p1
- "movq (%1,%3,2), %%mm1 \n\t" //p0
- "movq (%2), %%mm2 \n\t" //q0
- "movq (%2,%3), %%mm3 \n\t" //q1
- H264_DEBLOCK_MASK(%6, %7)
-
- "movd %5, %%mm4 \n\t"
- "punpcklbw %%mm4, %%mm4 \n\t"
- "punpcklwd %%mm4, %%mm4 \n\t"
- "pcmpeqb %%mm3, %%mm3 \n\t"
- "movq %%mm4, %%mm6 \n\t"
- "pcmpgtb %%mm3, %%mm4 \n\t"
- "movq %%mm6, 8+%0 \n\t"
- "pand %%mm4, %%mm7 \n\t"
- "movq %%mm7, %0 \n\t"
-
- /* filter p1 */
- "movq (%1), %%mm3 \n\t" //p2
- DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
- "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta
- "pand 8+%0, %%mm7 \n\t" // mask & tc0
- "movq %%mm7, %%mm4 \n\t"
- "psubb %%mm6, %%mm7 \n\t"
- "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0
- H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)
-
- /* filter q1 */
- "movq (%2,%3,2), %%mm4 \n\t" //q2
- DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
- "pand %0, %%mm6 \n\t"
- "movq 8+%0, %%mm5 \n\t" // can be merged with the and below but is slower then
- "pand %%mm6, %%mm5 \n\t"
- "psubb %%mm6, %%mm7 \n\t"
- "movq (%2,%3), %%mm3 \n\t"
- H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)
-
- /* filter p0, q0 */
- H264_DEBLOCK_P0_Q0(%8, unused)
- "movq %%mm1, (%1,%3,2) \n\t"
- "movq %%mm2, (%2) \n\t"
-
- : "=m"(*tmp0)
- : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
- "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
- "m"(ff_bone)
- );
-}
-
-static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
- if((tc0[0] & tc0[1]) >= 0)
- h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
- if((tc0[2] & tc0[3]) >= 0)
- h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
-}
-static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
- //FIXME: could cut some load/stores by merging transpose with filter
- // also, it only needs to transpose 6x8
- DECLARE_ALIGNED_8(uint8_t, trans[8*8]);
- int i;
- for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
- if((tc0[0] & tc0[1]) < 0)
- continue;
- transpose4x4(trans, pix-4, 8, stride);
- transpose4x4(trans +4*8, pix, 8, stride);
- transpose4x4(trans+4, pix-4+4*stride, 8, stride);
- transpose4x4(trans+4+4*8, pix +4*stride, 8, stride);
- h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
- transpose4x4(pix-2, trans +2*8, stride, 8);
- transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
- }
-}
-
-static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
-{
- __asm__ volatile(
- "movq (%0), %%mm0 \n\t" //p1
- "movq (%0,%2), %%mm1 \n\t" //p0
- "movq (%1), %%mm2 \n\t" //q0
- "movq (%1,%2), %%mm3 \n\t" //q1
- H264_DEBLOCK_MASK(%4, %5)
- "movd %3, %%mm6 \n\t"
- "punpcklbw %%mm6, %%mm6 \n\t"
- "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask
- H264_DEBLOCK_P0_Q0(%6, %7)
- "movq %%mm1, (%0,%2) \n\t"
- "movq %%mm2, (%1) \n\t"
-
- :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
- "r"(*(uint32_t*)tc0),
- "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
- );
-}
-
-static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
- h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
-}
-
-static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
- //FIXME: could cut some load/stores by merging transpose with filter
- DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
- transpose4x4(trans, pix-2, 8, stride);
- transpose4x4(trans+4, pix-2+4*stride, 8, stride);
- h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
- transpose4x4(pix-2, trans, stride, 8);
- transpose4x4(pix-2+4*stride, trans+4, stride, 8);
-}
-
-// p0 = (p0 + q1 + 2*p1 + 2) >> 2
-#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
- "movq "#p0", %%mm4 \n\t"\
- "pxor "#q1", %%mm4 \n\t"\
- "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\
- "pavgb "#q1", "#p0" \n\t"\
- "psubusb %%mm4, "#p0" \n\t"\
- "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\
-
-static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
-{
- __asm__ volatile(
- "movq (%0), %%mm0 \n\t"
- "movq (%0,%2), %%mm1 \n\t"
- "movq (%1), %%mm2 \n\t"
- "movq (%1,%2), %%mm3 \n\t"
- H264_DEBLOCK_MASK(%3, %4)
- "movq %%mm1, %%mm5 \n\t"
- "movq %%mm2, %%mm6 \n\t"
- H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
- H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
- "psubb %%mm5, %%mm1 \n\t"
- "psubb %%mm6, %%mm2 \n\t"
- "pand %%mm7, %%mm1 \n\t"
- "pand %%mm7, %%mm2 \n\t"
- "paddb %%mm5, %%mm1 \n\t"
- "paddb %%mm6, %%mm2 \n\t"
- "movq %%mm1, (%0,%2) \n\t"
- "movq %%mm2, (%1) \n\t"
- :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
- "m"(alpha1), "m"(beta1), "m"(ff_bone)
- );
-}
-
-static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
-{
- h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
-}
-
-static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
-{
- //FIXME: could cut some load/stores by merging transpose with filter
- DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
- transpose4x4(trans, pix-2, 8, stride);
- transpose4x4(trans+4, pix-2+4*stride, 8, stride);
- h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
- transpose4x4(pix-2, trans, stride, 8);
- transpose4x4(pix-2+4*stride, trans+4, stride, 8);
-}
-
-static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
- int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
- int dir;
- __asm__ volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "movq %0, %%mm6 \n\t"
- "movq %1, %%mm5 \n\t"
- "movq %2, %%mm4 \n\t"
- ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7)
- );
- if(field)
- __asm__ volatile(
- "movq %0, %%mm5 \n\t"
- "movq %1, %%mm4 \n\t"
- ::"m"(ff_pb_3_1), "m"(ff_pb_7_3)
- );
-
- // could do a special case for dir==0 && edges==1, but it only reduces the
- // average filter time by 1.2%
- for( dir=1; dir>=0; dir-- ) {
- const int d_idx = dir ? -8 : -1;
- const int mask_mv = dir ? mask_mv1 : mask_mv0;
- DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
- int b_idx, edge, l;
- for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
- __asm__ volatile(
- "pand %0, %%mm0 \n\t"
- ::"m"(mask_dir)
- );
- if(!(mask_mv & edge)) {
- __asm__ volatile("pxor %%mm0, %%mm0 \n\t":);
- for( l = bidir; l >= 0; l-- ) {
- __asm__ volatile(
- "movd %0, %%mm1 \n\t"
- "punpckldq %1, %%mm1 \n\t"
- "movq %%mm1, %%mm2 \n\t"
- "psrlw $7, %%mm2 \n\t"
- "pand %%mm6, %%mm2 \n\t"
- "por %%mm2, %%mm1 \n\t" // ref_cache with -2 mapped to -1
- "punpckldq %%mm1, %%mm2 \n\t"
- "pcmpeqb %%mm2, %%mm1 \n\t"
- "paddb %%mm6, %%mm1 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn]
- "por %%mm1, %%mm0 \n\t"
-
- "movq %2, %%mm1 \n\t"
- "movq %3, %%mm2 \n\t"
- "psubw %4, %%mm1 \n\t"
- "psubw %5, %%mm2 \n\t"
- "packsswb %%mm2, %%mm1 \n\t"
- "paddb %%mm5, %%mm1 \n\t"
- "pminub %%mm4, %%mm1 \n\t"
- "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit
- "por %%mm1, %%mm0 \n\t"
- ::"m"(ref[l][b_idx]),
- "m"(ref[l][b_idx+d_idx]),
- "m"(mv[l][b_idx][0]),
- "m"(mv[l][b_idx+2][0]),
- "m"(mv[l][b_idx+d_idx][0]),
- "m"(mv[l][b_idx+d_idx+2][0])
- );
- }
- }
- __asm__ volatile(
- "movd %0, %%mm1 \n\t"
- "por %1, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn]
- ::"m"(nnz[b_idx]),
- "m"(nnz[b_idx+d_idx])
- );
- __asm__ volatile(
- "pcmpeqw %%mm7, %%mm0 \n\t"
- "pcmpeqw %%mm7, %%mm0 \n\t"
- "psrlw $15, %%mm0 \n\t" // nonzero -> 1
- "psrlw $14, %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "por %%mm1, %%mm2 \n\t"
- "psrlw $1, %%mm1 \n\t"
- "pandn %%mm2, %%mm1 \n\t"
- "movq %%mm1, %0 \n\t"
- :"=m"(*bS[dir][edge])
- ::"memory"
- );
- }
- edges = 4;
- step = 1;
- }
- __asm__ volatile(
- "movq (%0), %%mm0 \n\t"
- "movq 8(%0), %%mm1 \n\t"
- "movq 16(%0), %%mm2 \n\t"
- "movq 24(%0), %%mm3 \n\t"
- TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
- "movq %%mm0, (%0) \n\t"
- "movq %%mm3, 8(%0) \n\t"
- "movq %%mm4, 16(%0) \n\t"
- "movq %%mm2, 24(%0) \n\t"
- ::"r"(bS[0])
- :"memory"
- );
-}
-
-/***********************************/
-/* motion compensation */
-
-#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
- "mov"#q" "#C", "#T" \n\t"\
- "mov"#d" (%0), "#F" \n\t"\
- "paddw "#D", "#T" \n\t"\
- "psllw $2, "#T" \n\t"\
- "psubw "#B", "#T" \n\t"\
- "psubw "#E", "#T" \n\t"\
- "punpcklbw "#Z", "#F" \n\t"\
- "pmullw %4, "#T" \n\t"\
- "paddw %5, "#A" \n\t"\
- "add %2, %0 \n\t"\
- "paddw "#F", "#A" \n\t"\
- "paddw "#A", "#T" \n\t"\
- "psraw $5, "#T" \n\t"\
- "packuswb "#T", "#T" \n\t"\
- OP(T, (%1), A, d)\
- "add %3, %1 \n\t"
-
-#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
- "mov"#q" "#C", "#T" \n\t"\
- "mov"#d" (%0), "#F" \n\t"\
- "paddw "#D", "#T" \n\t"\
- "psllw $2, "#T" \n\t"\
- "paddw %4, "#A" \n\t"\
- "psubw "#B", "#T" \n\t"\
- "psubw "#E", "#T" \n\t"\
- "punpcklbw "#Z", "#F" \n\t"\
- "pmullw %3, "#T" \n\t"\
- "paddw "#F", "#A" \n\t"\
- "add %2, %0 \n\t"\
- "paddw "#A", "#T" \n\t"\
- "mov"#q" "#T", "#OF"(%1) \n\t"
-
-#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
-#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
-#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
-#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
-
-
-#define QPEL_H264(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- int h=4;\
-\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movq %5, %%mm4 \n\t"\
- "movq %6, %%mm5 \n\t"\
- "1: \n\t"\
- "movd -1(%0), %%mm1 \n\t"\
- "movd (%0), %%mm2 \n\t"\
- "movd 1(%0), %%mm3 \n\t"\
- "movd 2(%0), %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "paddw %%mm0, %%mm1 \n\t"\
- "paddw %%mm3, %%mm2 \n\t"\
- "movd -2(%0), %%mm0 \n\t"\
- "movd 3(%0), %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "paddw %%mm3, %%mm0 \n\t"\
- "psllw $2, %%mm2 \n\t"\
- "psubw %%mm1, %%mm2 \n\t"\
- "pmullw %%mm4, %%mm2 \n\t"\
- "paddw %%mm5, %%mm0 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "packuswb %%mm0, %%mm0 \n\t"\
- OP(%%mm0, (%1),%%mm6, d)\
- "add %3, %0 \n\t"\
- "add %4, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(src), "+c"(dst), "+g"(h)\
- : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
-}\
-static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- int h=4;\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movq %0, %%mm4 \n\t"\
- "movq %1, %%mm5 \n\t"\
- :: "m"(ff_pw_5), "m"(ff_pw_16)\
- );\
- do{\
- __asm__ volatile(\
- "movd -1(%0), %%mm1 \n\t"\
- "movd (%0), %%mm2 \n\t"\
- "movd 1(%0), %%mm3 \n\t"\
- "movd 2(%0), %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "paddw %%mm0, %%mm1 \n\t"\
- "paddw %%mm3, %%mm2 \n\t"\
- "movd -2(%0), %%mm0 \n\t"\
- "movd 3(%0), %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "paddw %%mm3, %%mm0 \n\t"\
- "psllw $2, %%mm2 \n\t"\
- "psubw %%mm1, %%mm2 \n\t"\
- "pmullw %%mm4, %%mm2 \n\t"\
- "paddw %%mm5, %%mm0 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "movd (%2), %%mm3 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "packuswb %%mm0, %%mm0 \n\t"\
- PAVGB" %%mm3, %%mm0 \n\t"\
- OP(%%mm0, (%1),%%mm6, d)\
- "add %4, %0 \n\t"\
- "add %4, %1 \n\t"\
- "add %3, %2 \n\t"\
- : "+a"(src), "+c"(dst), "+d"(src2)\
- : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
- : "memory"\
- );\
- }while(--h);\
-}\
-static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- src -= 2*srcStride;\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movd (%0), %%mm0 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm1 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm2 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm3 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm4 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
- QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
- QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
- QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
- \
- : "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
-}\
-static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- int h=4;\
- int w=3;\
- src -= 2*srcStride+2;\
- while(w--){\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movd (%0), %%mm0 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm1 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm2 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm3 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm4 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
- QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
- QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
- QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
- \
- : "+a"(src)\
- : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
- tmp += 4;\
- src += 4 - 9*srcStride;\
- }\
- tmp -= 3*4;\
- __asm__ volatile(\
- "1: \n\t"\
- "movq (%0), %%mm0 \n\t"\
- "paddw 10(%0), %%mm0 \n\t"\
- "movq 2(%0), %%mm1 \n\t"\
- "paddw 8(%0), %%mm1 \n\t"\
- "movq 4(%0), %%mm2 \n\t"\
- "paddw 6(%0), %%mm2 \n\t"\
- "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
- "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
- "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
- "paddsw %%mm2, %%mm0 \n\t"\
- "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\
- "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\
- "psraw $6, %%mm0 \n\t"\
- "packuswb %%mm0, %%mm0 \n\t"\
- OP(%%mm0, (%1),%%mm7, d)\
- "add $24, %0 \n\t"\
- "add %3, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(tmp), "+c"(dst), "+g"(h)\
- : "S"((x86_reg)dstStride)\
- : "memory"\
- );\
-}\
-\
-static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- int h=8;\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movq %5, %%mm6 \n\t"\
- "1: \n\t"\
- "movq (%0), %%mm0 \n\t"\
- "movq 1(%0), %%mm2 \n\t"\
- "movq %%mm0, %%mm1 \n\t"\
- "movq %%mm2, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpckhbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm3 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "paddw %%mm3, %%mm1 \n\t"\
- "psllw $2, %%mm0 \n\t"\
- "psllw $2, %%mm1 \n\t"\
- "movq -1(%0), %%mm2 \n\t"\
- "movq 2(%0), %%mm4 \n\t"\
- "movq %%mm2, %%mm3 \n\t"\
- "movq %%mm4, %%mm5 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- "punpckhbw %%mm7, %%mm5 \n\t"\
- "paddw %%mm4, %%mm2 \n\t"\
- "paddw %%mm3, %%mm5 \n\t"\
- "psubw %%mm2, %%mm0 \n\t"\
- "psubw %%mm5, %%mm1 \n\t"\
- "pmullw %%mm6, %%mm0 \n\t"\
- "pmullw %%mm6, %%mm1 \n\t"\
- "movd -2(%0), %%mm2 \n\t"\
- "movd 7(%0), %%mm5 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm5 \n\t"\
- "paddw %%mm3, %%mm2 \n\t"\
- "paddw %%mm5, %%mm4 \n\t"\
- "movq %6, %%mm5 \n\t"\
- "paddw %%mm5, %%mm2 \n\t"\
- "paddw %%mm5, %%mm4 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "paddw %%mm4, %%mm1 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "psraw $5, %%mm1 \n\t"\
- "packuswb %%mm1, %%mm0 \n\t"\
- OP(%%mm0, (%1),%%mm5, q)\
- "add %3, %0 \n\t"\
- "add %4, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(src), "+c"(dst), "+g"(h)\
- : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
-}\
-\
-static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- int h=8;\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movq %0, %%mm6 \n\t"\
- :: "m"(ff_pw_5)\
- );\
- do{\
- __asm__ volatile(\
- "movq (%0), %%mm0 \n\t"\
- "movq 1(%0), %%mm2 \n\t"\
- "movq %%mm0, %%mm1 \n\t"\
- "movq %%mm2, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpckhbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm3 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "paddw %%mm3, %%mm1 \n\t"\
- "psllw $2, %%mm0 \n\t"\
- "psllw $2, %%mm1 \n\t"\
- "movq -1(%0), %%mm2 \n\t"\
- "movq 2(%0), %%mm4 \n\t"\
- "movq %%mm2, %%mm3 \n\t"\
- "movq %%mm4, %%mm5 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- "punpckhbw %%mm7, %%mm5 \n\t"\
- "paddw %%mm4, %%mm2 \n\t"\
- "paddw %%mm3, %%mm5 \n\t"\
- "psubw %%mm2, %%mm0 \n\t"\
- "psubw %%mm5, %%mm1 \n\t"\
- "pmullw %%mm6, %%mm0 \n\t"\
- "pmullw %%mm6, %%mm1 \n\t"\
- "movd -2(%0), %%mm2 \n\t"\
- "movd 7(%0), %%mm5 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm5 \n\t"\
- "paddw %%mm3, %%mm2 \n\t"\
- "paddw %%mm5, %%mm4 \n\t"\
- "movq %5, %%mm5 \n\t"\
- "paddw %%mm5, %%mm2 \n\t"\
- "paddw %%mm5, %%mm4 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "paddw %%mm4, %%mm1 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "psraw $5, %%mm1 \n\t"\
- "movq (%2), %%mm4 \n\t"\
- "packuswb %%mm1, %%mm0 \n\t"\
- PAVGB" %%mm4, %%mm0 \n\t"\
- OP(%%mm0, (%1),%%mm5, q)\
- "add %4, %0 \n\t"\
- "add %4, %1 \n\t"\
- "add %3, %2 \n\t"\
- : "+a"(src), "+c"(dst), "+d"(src2)\
- : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
- "m"(ff_pw_16)\
- : "memory"\
- );\
- }while(--h);\
-}\
-\
-static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- int w= 2;\
- src -= 2*srcStride;\
- \
- while(w--){\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movd (%0), %%mm0 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm1 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm2 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm3 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm4 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
- QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
- QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
- QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
- QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
- QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
- QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
- QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
- \
- : "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
- if(h==16){\
- __asm__ volatile(\
- QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
- QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
- QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
- QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
- QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
- QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
- QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
- QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
- \
- : "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
- }\
- src += 4-(h+5)*srcStride;\
- dst += 4-h*dstStride;\
- }\
-}\
-static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
- int w = (size+8)>>2;\
- src -= 2*srcStride+2;\
- while(w--){\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movd (%0), %%mm0 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm1 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm2 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm3 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm4 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
- QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
- QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
- QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
- QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
- QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
- QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
- QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
- : "+a"(src)\
- : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
- if(size==16){\
- __asm__ volatile(\
- QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
- QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
- QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
- QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
- QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
- QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
- QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
- QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
- : "+a"(src)\
- : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
- }\
- tmp += 4;\
- src += 4 - (size+5)*srcStride;\
- }\
-}\
-static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
- int w = size>>4;\
- do{\
- int h = size;\
- __asm__ volatile(\
- "1: \n\t"\
- "movq (%0), %%mm0 \n\t"\
- "movq 8(%0), %%mm3 \n\t"\
- "movq 2(%0), %%mm1 \n\t"\
- "movq 10(%0), %%mm4 \n\t"\
- "paddw %%mm4, %%mm0 \n\t"\
- "paddw %%mm3, %%mm1 \n\t"\
- "paddw 18(%0), %%mm3 \n\t"\
- "paddw 16(%0), %%mm4 \n\t"\
- "movq 4(%0), %%mm2 \n\t"\
- "movq 12(%0), %%mm5 \n\t"\
- "paddw 6(%0), %%mm2 \n\t"\
- "paddw 14(%0), %%mm5 \n\t"\
- "psubw %%mm1, %%mm0 \n\t"\
- "psubw %%mm4, %%mm3 \n\t"\
- "psraw $2, %%mm0 \n\t"\
- "psraw $2, %%mm3 \n\t"\
- "psubw %%mm1, %%mm0 \n\t"\
- "psubw %%mm4, %%mm3 \n\t"\
- "paddsw %%mm2, %%mm0 \n\t"\
- "paddsw %%mm5, %%mm3 \n\t"\
- "psraw $2, %%mm0 \n\t"\
- "psraw $2, %%mm3 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "paddw %%mm5, %%mm3 \n\t"\
- "psraw $6, %%mm0 \n\t"\
- "psraw $6, %%mm3 \n\t"\
- "packuswb %%mm3, %%mm0 \n\t"\
- OP(%%mm0, (%1),%%mm7, q)\
- "add $48, %0 \n\t"\
- "add %3, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(tmp), "+c"(dst), "+g"(h)\
- : "S"((x86_reg)dstStride)\
- : "memory"\
- );\
- tmp += 8 - size*24;\
- dst += 8 - size*dstStride;\
- }while(w--);\
-}\
-\
-static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
-}\
-static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
- OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
-}\
-\
-static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
- src += 8*srcStride;\
- dst += 8*dstStride;\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
-}\
-\
-static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
- src += 8*dstStride;\
- dst += 8*dstStride;\
- src2 += 8*src2Stride;\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
-}\
-\
-static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
- put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
- OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
-}\
-static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
-}\
-\
-static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
-}\
-\
-static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
-{\
- __asm__ volatile(\
- "movq (%1), %%mm0 \n\t"\
- "movq 24(%1), %%mm1 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "psraw $5, %%mm1 \n\t"\
- "packuswb %%mm0, %%mm0 \n\t"\
- "packuswb %%mm1, %%mm1 \n\t"\
- PAVGB" (%0), %%mm0 \n\t"\
- PAVGB" (%0,%3), %%mm1 \n\t"\
- OP(%%mm0, (%2), %%mm4, d)\
- OP(%%mm1, (%2,%4), %%mm5, d)\
- "lea (%0,%3,2), %0 \n\t"\
- "lea (%2,%4,2), %2 \n\t"\
- "movq 48(%1), %%mm0 \n\t"\
- "movq 72(%1), %%mm1 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "psraw $5, %%mm1 \n\t"\
- "packuswb %%mm0, %%mm0 \n\t"\
- "packuswb %%mm1, %%mm1 \n\t"\
- PAVGB" (%0), %%mm0 \n\t"\
- PAVGB" (%0,%3), %%mm1 \n\t"\
- OP(%%mm0, (%2), %%mm4, d)\
- OP(%%mm1, (%2,%4), %%mm5, d)\
- :"+a"(src8), "+c"(src16), "+d"(dst)\
- :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
- :"memory");\
-}\
-static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
-{\
- do{\
- __asm__ volatile(\
- "movq (%1), %%mm0 \n\t"\
- "movq 8(%1), %%mm1 \n\t"\
- "movq 48(%1), %%mm2 \n\t"\
- "movq 8+48(%1), %%mm3 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "psraw $5, %%mm1 \n\t"\
- "psraw $5, %%mm2 \n\t"\
- "psraw $5, %%mm3 \n\t"\
- "packuswb %%mm1, %%mm0 \n\t"\
- "packuswb %%mm3, %%mm2 \n\t"\
- PAVGB" (%0), %%mm0 \n\t"\
- PAVGB" (%0,%3), %%mm2 \n\t"\
- OP(%%mm0, (%2), %%mm5, q)\
- OP(%%mm2, (%2,%4), %%mm5, q)\
- ::"a"(src8), "c"(src16), "d"(dst),\
- "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
- :"memory");\
- src8 += 2L*src8Stride;\
- src16 += 48;\
- dst += 2L*dstStride;\
- }while(h-=2);\
-}\
-static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
-{\
- OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
- OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
-}\
-
-
-#ifdef ARCH_X86_64
-#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- int h=16;\
- __asm__ volatile(\
- "pxor %%xmm15, %%xmm15 \n\t"\
- "movdqa %6, %%xmm14 \n\t"\
- "movdqa %7, %%xmm13 \n\t"\
- "1: \n\t"\
- "lddqu 3(%0), %%xmm1 \n\t"\
- "lddqu -5(%0), %%xmm7 \n\t"\
- "movdqa %%xmm1, %%xmm0 \n\t"\
- "punpckhbw %%xmm15, %%xmm1 \n\t"\
- "punpcklbw %%xmm15, %%xmm0 \n\t"\
- "punpcklbw %%xmm15, %%xmm7 \n\t"\
- "movdqa %%xmm1, %%xmm2 \n\t"\
- "movdqa %%xmm0, %%xmm6 \n\t"\
- "movdqa %%xmm1, %%xmm3 \n\t"\
- "movdqa %%xmm0, %%xmm8 \n\t"\
- "movdqa %%xmm1, %%xmm4 \n\t"\
- "movdqa %%xmm0, %%xmm9 \n\t"\
- "movdqa %%xmm1, %%xmm5 \n\t"\
- "movdqa %%xmm0, %%xmm10 \n\t"\
- "palignr $6, %%xmm0, %%xmm5 \n\t"\
- "palignr $6, %%xmm7, %%xmm10\n\t"\
- "palignr $8, %%xmm0, %%xmm4 \n\t"\
- "palignr $8, %%xmm7, %%xmm9 \n\t"\
- "palignr $10,%%xmm0, %%xmm3 \n\t"\
- "palignr $10,%%xmm7, %%xmm8 \n\t"\
- "paddw %%xmm1, %%xmm5 \n\t"\
- "paddw %%xmm0, %%xmm10 \n\t"\
- "palignr $12,%%xmm0, %%xmm2 \n\t"\
- "palignr $12,%%xmm7, %%xmm6 \n\t"\
- "palignr $14,%%xmm0, %%xmm1 \n\t"\
- "palignr $14,%%xmm7, %%xmm0 \n\t"\
- "paddw %%xmm3, %%xmm2 \n\t"\
- "paddw %%xmm8, %%xmm6 \n\t"\
- "paddw %%xmm4, %%xmm1 \n\t"\
- "paddw %%xmm9, %%xmm0 \n\t"\
- "psllw $2, %%xmm2 \n\t"\
- "psllw $2, %%xmm6 \n\t"\
- "psubw %%xmm1, %%xmm2 \n\t"\
- "psubw %%xmm0, %%xmm6 \n\t"\
- "paddw %%xmm13,%%xmm5 \n\t"\
- "paddw %%xmm13,%%xmm10 \n\t"\
- "pmullw %%xmm14,%%xmm2 \n\t"\
- "pmullw %%xmm14,%%xmm6 \n\t"\
- "lddqu (%2), %%xmm3 \n\t"\
- "paddw %%xmm5, %%xmm2 \n\t"\
- "paddw %%xmm10,%%xmm6 \n\t"\
- "psraw $5, %%xmm2 \n\t"\
- "psraw $5, %%xmm6 \n\t"\
- "packuswb %%xmm2,%%xmm6 \n\t"\
- "pavgb %%xmm3, %%xmm6 \n\t"\
- OP(%%xmm6, (%1), %%xmm4, dqa)\
- "add %5, %0 \n\t"\
- "add %5, %1 \n\t"\
- "add %4, %2 \n\t"\
- "decl %3 \n\t"\
- "jg 1b \n\t"\
- : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
- : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
- "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
-}
-#else // ARCH_X86_64
-#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
- src += 8*dstStride;\
- dst += 8*dstStride;\
- src2 += 8*src2Stride;\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
-}
-#endif // ARCH_X86_64
-
-#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- int h=8;\
- __asm__ volatile(\
- "pxor %%xmm7, %%xmm7 \n\t"\
- "movdqa %0, %%xmm6 \n\t"\
- :: "m"(ff_pw_5)\
- );\
- do{\
- __asm__ volatile(\
- "lddqu -5(%0), %%xmm1 \n\t"\
- "movdqa %%xmm1, %%xmm0 \n\t"\
- "punpckhbw %%xmm7, %%xmm1 \n\t"\
- "punpcklbw %%xmm7, %%xmm0 \n\t"\
- "movdqa %%xmm1, %%xmm2 \n\t"\
- "movdqa %%xmm1, %%xmm3 \n\t"\
- "movdqa %%xmm1, %%xmm4 \n\t"\
- "movdqa %%xmm1, %%xmm5 \n\t"\
- "palignr $6, %%xmm0, %%xmm5 \n\t"\
- "palignr $8, %%xmm0, %%xmm4 \n\t"\
- "palignr $10,%%xmm0, %%xmm3 \n\t"\
- "paddw %%xmm1, %%xmm5 \n\t"\
- "palignr $12,%%xmm0, %%xmm2 \n\t"\
- "palignr $14,%%xmm0, %%xmm1 \n\t"\
- "paddw %%xmm3, %%xmm2 \n\t"\
- "paddw %%xmm4, %%xmm1 \n\t"\
- "psllw $2, %%xmm2 \n\t"\
- "movq (%2), %%xmm3 \n\t"\
- "psubw %%xmm1, %%xmm2 \n\t"\
- "paddw %5, %%xmm5 \n\t"\
- "pmullw %%xmm6, %%xmm2 \n\t"\
- "paddw %%xmm5, %%xmm2 \n\t"\
- "psraw $5, %%xmm2 \n\t"\
- "packuswb %%xmm2, %%xmm2 \n\t"\
- "pavgb %%xmm3, %%xmm2 \n\t"\
- OP(%%xmm2, (%1), %%xmm4, q)\
- "add %4, %0 \n\t"\
- "add %4, %1 \n\t"\
- "add %3, %2 \n\t"\
- : "+a"(src), "+c"(dst), "+d"(src2)\
- : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
- "m"(ff_pw_16)\
- : "memory"\
- );\
- }while(--h);\
-}\
-QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
-\
-static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- int h=8;\
- __asm__ volatile(\
- "pxor %%xmm7, %%xmm7 \n\t"\
- "movdqa %5, %%xmm6 \n\t"\
- "1: \n\t"\
- "lddqu -5(%0), %%xmm1 \n\t"\
- "movdqa %%xmm1, %%xmm0 \n\t"\
- "punpckhbw %%xmm7, %%xmm1 \n\t"\
- "punpcklbw %%xmm7, %%xmm0 \n\t"\
- "movdqa %%xmm1, %%xmm2 \n\t"\
- "movdqa %%xmm1, %%xmm3 \n\t"\
- "movdqa %%xmm1, %%xmm4 \n\t"\
- "movdqa %%xmm1, %%xmm5 \n\t"\
- "palignr $6, %%xmm0, %%xmm5 \n\t"\
- "palignr $8, %%xmm0, %%xmm4 \n\t"\
- "palignr $10,%%xmm0, %%xmm3 \n\t"\
- "paddw %%xmm1, %%xmm5 \n\t"\
- "palignr $12,%%xmm0, %%xmm2 \n\t"\
- "palignr $14,%%xmm0, %%xmm1 \n\t"\
- "paddw %%xmm3, %%xmm2 \n\t"\
- "paddw %%xmm4, %%xmm1 \n\t"\
- "psllw $2, %%xmm2 \n\t"\
- "psubw %%xmm1, %%xmm2 \n\t"\
- "paddw %6, %%xmm5 \n\t"\
- "pmullw %%xmm6, %%xmm2 \n\t"\
- "paddw %%xmm5, %%xmm2 \n\t"\
- "psraw $5, %%xmm2 \n\t"\
- "packuswb %%xmm2, %%xmm2 \n\t"\
- OP(%%xmm2, (%1), %%xmm4, q)\
- "add %3, %0 \n\t"\
- "add %4, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(src), "+c"(dst), "+g"(h)\
- : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\
- "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
-}\
-static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
- src += 8*srcStride;\
- dst += 8*dstStride;\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
-}\
-
-#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- src -= 2*srcStride;\
- \
- __asm__ volatile(\
- "pxor %%xmm7, %%xmm7 \n\t"\
- "movq (%0), %%xmm0 \n\t"\
- "add %2, %0 \n\t"\
- "movq (%0), %%xmm1 \n\t"\
- "add %2, %0 \n\t"\
- "movq (%0), %%xmm2 \n\t"\
- "add %2, %0 \n\t"\
- "movq (%0), %%xmm3 \n\t"\
- "add %2, %0 \n\t"\
- "movq (%0), %%xmm4 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%xmm7, %%xmm0 \n\t"\
- "punpcklbw %%xmm7, %%xmm1 \n\t"\
- "punpcklbw %%xmm7, %%xmm2 \n\t"\
- "punpcklbw %%xmm7, %%xmm3 \n\t"\
- "punpcklbw %%xmm7, %%xmm4 \n\t"\
- QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
- QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
- QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
- QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
- QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
- QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
- QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
- QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
- \
- : "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
- if(h==16){\
- __asm__ volatile(\
- QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
- QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
- QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
- QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
- QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
- QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
- QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
- QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
- \
- : "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
- }\
-}\
-static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
-}\
-static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
- OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
-}
-
-static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
- int w = (size+8)>>3;
- src -= 2*srcStride+2;
- while(w--){
- __asm__ volatile(
- "pxor %%xmm7, %%xmm7 \n\t"
- "movq (%0), %%xmm0 \n\t"
- "add %2, %0 \n\t"
- "movq (%0), %%xmm1 \n\t"
- "add %2, %0 \n\t"
- "movq (%0), %%xmm2 \n\t"
- "add %2, %0 \n\t"
- "movq (%0), %%xmm3 \n\t"
- "add %2, %0 \n\t"
- "movq (%0), %%xmm4 \n\t"
- "add %2, %0 \n\t"
- "punpcklbw %%xmm7, %%xmm0 \n\t"
- "punpcklbw %%xmm7, %%xmm1 \n\t"
- "punpcklbw %%xmm7, %%xmm2 \n\t"
- "punpcklbw %%xmm7, %%xmm3 \n\t"
- "punpcklbw %%xmm7, %%xmm4 \n\t"
- QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
- QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
- QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
- QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
- QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
- QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
- QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
- QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
- : "+a"(src)
- : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
- : "memory"
- );
- if(size==16){
- __asm__ volatile(
- QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
- QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
- QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
- QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
- QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
- QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
- QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
- QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
- : "+a"(src)
- : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
- : "memory"
- );
- }
- tmp += 8;
- src += 8 - (size+5)*srcStride;
- }
-}
-
-#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
-static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
- int h = size;\
- if(size == 16){\
- __asm__ volatile(\
- "1: \n\t"\
- "movdqa 32(%0), %%xmm4 \n\t"\
- "movdqa 16(%0), %%xmm5 \n\t"\
- "movdqa (%0), %%xmm7 \n\t"\
- "movdqa %%xmm4, %%xmm3 \n\t"\
- "movdqa %%xmm4, %%xmm2 \n\t"\
- "movdqa %%xmm4, %%xmm1 \n\t"\
- "movdqa %%xmm4, %%xmm0 \n\t"\
- "palignr $10, %%xmm5, %%xmm0 \n\t"\
- "palignr $8, %%xmm5, %%xmm1 \n\t"\
- "palignr $6, %%xmm5, %%xmm2 \n\t"\
- "palignr $4, %%xmm5, %%xmm3 \n\t"\
- "palignr $2, %%xmm5, %%xmm4 \n\t"\
- "paddw %%xmm5, %%xmm0 \n\t"\
- "paddw %%xmm4, %%xmm1 \n\t"\
- "paddw %%xmm3, %%xmm2 \n\t"\
- "movdqa %%xmm5, %%xmm6 \n\t"\
- "movdqa %%xmm5, %%xmm4 \n\t"\
- "movdqa %%xmm5, %%xmm3 \n\t"\
- "palignr $8, %%xmm7, %%xmm4 \n\t"\
- "palignr $2, %%xmm7, %%xmm6 \n\t"\
- "palignr $10, %%xmm7, %%xmm3 \n\t"\
- "paddw %%xmm6, %%xmm4 \n\t"\
- "movdqa %%xmm5, %%xmm6 \n\t"\
- "palignr $6, %%xmm7, %%xmm5 \n\t"\
- "palignr $4, %%xmm7, %%xmm6 \n\t"\
- "paddw %%xmm7, %%xmm3 \n\t"\
- "paddw %%xmm6, %%xmm5 \n\t"\
- \
- "psubw %%xmm1, %%xmm0 \n\t"\
- "psubw %%xmm4, %%xmm3 \n\t"\
- "psraw $2, %%xmm0 \n\t"\
- "psraw $2, %%xmm3 \n\t"\
- "psubw %%xmm1, %%xmm0 \n\t"\
- "psubw %%xmm4, %%xmm3 \n\t"\
- "paddw %%xmm2, %%xmm0 \n\t"\
- "paddw %%xmm5, %%xmm3 \n\t"\
- "psraw $2, %%xmm0 \n\t"\
- "psraw $2, %%xmm3 \n\t"\
- "paddw %%xmm2, %%xmm0 \n\t"\
- "paddw %%xmm5, %%xmm3 \n\t"\
- "psraw $6, %%xmm0 \n\t"\
- "psraw $6, %%xmm3 \n\t"\
- "packuswb %%xmm0, %%xmm3 \n\t"\
- OP(%%xmm3, (%1), %%xmm7, dqa)\
- "add $48, %0 \n\t"\
- "add %3, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(tmp), "+c"(dst), "+g"(h)\
- : "S"((x86_reg)dstStride)\
- : "memory"\
- );\
- }else{\
- __asm__ volatile(\
- "1: \n\t"\
- "movdqa 16(%0), %%xmm1 \n\t"\
- "movdqa (%0), %%xmm0 \n\t"\
- "movdqa %%xmm1, %%xmm2 \n\t"\
- "movdqa %%xmm1, %%xmm3 \n\t"\
- "movdqa %%xmm1, %%xmm4 \n\t"\
- "movdqa %%xmm1, %%xmm5 \n\t"\
- "palignr $10, %%xmm0, %%xmm5 \n\t"\
- "palignr $8, %%xmm0, %%xmm4 \n\t"\
- "palignr $6, %%xmm0, %%xmm3 \n\t"\
- "palignr $4, %%xmm0, %%xmm2 \n\t"\
- "palignr $2, %%xmm0, %%xmm1 \n\t"\
- "paddw %%xmm5, %%xmm0 \n\t"\
- "paddw %%xmm4, %%xmm1 \n\t"\
- "paddw %%xmm3, %%xmm2 \n\t"\
- "psubw %%xmm1, %%xmm0 \n\t"\
- "psraw $2, %%xmm0 \n\t"\
- "psubw %%xmm1, %%xmm0 \n\t"\
- "paddw %%xmm2, %%xmm0 \n\t"\
- "psraw $2, %%xmm0 \n\t"\
- "paddw %%xmm2, %%xmm0 \n\t"\
- "psraw $6, %%xmm0 \n\t"\
- "packuswb %%xmm0, %%xmm0 \n\t"\
- OP(%%xmm0, (%1), %%xmm7, q)\
- "add $48, %0 \n\t"\
- "add %3, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(tmp), "+c"(dst), "+g"(h)\
- : "S"((x86_reg)dstStride)\
- : "memory"\
- );\
- }\
-}
-
-#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
- put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
- OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
-}\
-static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
-}\
-static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
-}\
-
-#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
-#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
-#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
-#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
-#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
-#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
-#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
-#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
-
-#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
-#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
-#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
-#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
-#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
-#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
-#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
-#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
-
-#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
-#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
-#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
-#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
-
-#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
-#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
-#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
-#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
-
-#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
-#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
-
-#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
-H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
-
-static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
- put_pixels16_sse2(dst, src, stride, 16);
-}
-static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
- avg_pixels16_sse2(dst, src, stride, 16);
-}
-#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
-#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
-
-#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
-}\
-
-#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
-}\
-
-#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
- put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
- put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
-}\
-
-#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
- put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
- put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
- put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
- put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint16_t, temp[SIZE*(SIZE<8?12:24)]);\
- OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
- uint8_t * const halfHV= temp;\
- int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
- put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
- uint8_t * const halfHV= temp;\
- int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
- put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
- uint8_t * const halfHV= temp;\
- int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
- put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
- uint8_t * const halfHV= temp;\
- int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
- put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
-}\
-
-#define H264_MC_4816(MMX)\
-H264_MC(put_, 4, MMX, 8)\
-H264_MC(put_, 8, MMX, 8)\
-H264_MC(put_, 16,MMX, 8)\
-H264_MC(avg_, 4, MMX, 8)\
-H264_MC(avg_, 8, MMX, 8)\
-H264_MC(avg_, 16,MMX, 8)\
-
-#define H264_MC_816(QPEL, XMM)\
-QPEL(put_, 8, XMM, 16)\
-QPEL(put_, 16,XMM, 16)\
-QPEL(avg_, 8, XMM, 16)\
-QPEL(avg_, 16,XMM, 16)\
-
-
-#define AVG_3DNOW_OP(a,b,temp, size) \
-"mov" #size " " #b ", " #temp " \n\t"\
-"pavgusb " #temp ", " #a " \n\t"\
-"mov" #size " " #a ", " #b " \n\t"
-#define AVG_MMX2_OP(a,b,temp, size) \
-"mov" #size " " #b ", " #temp " \n\t"\
-"pavgb " #temp ", " #a " \n\t"\
-"mov" #size " " #a ", " #b " \n\t"
-
-#define PAVGB "pavgusb"
-QPEL_H264(put_, PUT_OP, 3dnow)
-QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
-#undef PAVGB
-#define PAVGB "pavgb"
-QPEL_H264(put_, PUT_OP, mmx2)
-QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
-QPEL_H264_V_XMM(put_, PUT_OP, sse2)
-QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
-QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
-QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
-#ifdef HAVE_SSSE3
-QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
-QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
-QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
-QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
-QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
-QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3)
-#endif
-#undef PAVGB
-
-H264_MC_4816(3dnow)
-H264_MC_4816(mmx2)
-H264_MC_816(H264_MC_V, sse2)
-H264_MC_816(H264_MC_HV, sse2)
-#ifdef HAVE_SSSE3
-H264_MC_816(H264_MC_H, ssse3)
-H264_MC_816(H264_MC_HV, ssse3)
-#endif
-
-
-#define H264_CHROMA_OP(S,D)
-#define H264_CHROMA_OP4(S,D,T)
-#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
-#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
-#define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
-#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
-#include "dsputil_h264_template_mmx.c"
-
-static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 1);
-}
-static void put_h264_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 0);
-}
-
-#undef H264_CHROMA_OP
-#undef H264_CHROMA_OP4
-#undef H264_CHROMA_MC8_TMPL
-#undef H264_CHROMA_MC4_TMPL
-#undef H264_CHROMA_MC2_TMPL
-#undef H264_CHROMA_MC8_MV0
-
-#define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
-#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
- "pavgb " #T ", " #D " \n\t"
-#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
-#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2
-#define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
-#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
-#include "dsputil_h264_template_mmx.c"
-static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- avg_h264_chroma_mc8_mmx2(dst, src, stride, h, x, y, 1);
-}
-#undef H264_CHROMA_OP
-#undef H264_CHROMA_OP4
-#undef H264_CHROMA_MC8_TMPL
-#undef H264_CHROMA_MC4_TMPL
-#undef H264_CHROMA_MC2_TMPL
-#undef H264_CHROMA_MC8_MV0
-
-#define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
-#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
- "pavgusb " #T ", " #D " \n\t"
-#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_3dnow
-#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_3dnow
-#define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
-#include "dsputil_h264_template_mmx.c"
-static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- avg_h264_chroma_mc8_3dnow(dst, src, stride, h, x, y, 1);
-}
-#undef H264_CHROMA_OP
-#undef H264_CHROMA_OP4
-#undef H264_CHROMA_MC8_TMPL
-#undef H264_CHROMA_MC4_TMPL
-#undef H264_CHROMA_MC8_MV0
-
-#ifdef HAVE_SSSE3
-#define AVG_OP(X)
-#undef H264_CHROMA_MC8_TMPL
-#undef H264_CHROMA_MC4_TMPL
-#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
-#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
-#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
-#include "dsputil_h264_template_ssse3.c"
-static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
-}
-static void put_h264_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0);
-}
-
-#undef AVG_OP
-#undef H264_CHROMA_MC8_TMPL
-#undef H264_CHROMA_MC4_TMPL
-#undef H264_CHROMA_MC8_MV0
-#define AVG_OP(X) X
-#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
-#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
-#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
-#include "dsputil_h264_template_ssse3.c"
-static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
-}
-#undef AVG_OP
-#undef H264_CHROMA_MC8_TMPL
-#undef H264_CHROMA_MC4_TMPL
-#undef H264_CHROMA_MC8_MV0
-#endif
-
-/***********************************/
-/* weighted prediction */
-
-static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
-{
- int x, y;
- offset <<= log2_denom;
- offset += (1 << log2_denom) >> 1;
- __asm__ volatile(
- "movd %0, %%mm4 \n\t"
- "movd %1, %%mm5 \n\t"
- "movd %2, %%mm6 \n\t"
- "pshufw $0, %%mm4, %%mm4 \n\t"
- "pshufw $0, %%mm5, %%mm5 \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- :: "g"(weight), "g"(offset), "g"(log2_denom)
- );
- for(y=0; y<h; y+=2){
- for(x=0; x<w; x+=4){
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "movd %1, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "pmullw %%mm4, %%mm0 \n\t"
- "pmullw %%mm4, %%mm1 \n\t"
- "paddsw %%mm5, %%mm0 \n\t"
- "paddsw %%mm5, %%mm1 \n\t"
- "psraw %%mm6, %%mm0 \n\t"
- "psraw %%mm6, %%mm1 \n\t"
- "packuswb %%mm7, %%mm0 \n\t"
- "packuswb %%mm7, %%mm1 \n\t"
- "movd %%mm0, %0 \n\t"
- "movd %%mm1, %1 \n\t"
- : "+m"(*(uint32_t*)(dst+x)),
- "+m"(*(uint32_t*)(dst+x+stride))
- );
- }
- dst += 2*stride;
- }
-}
-
-static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
-{
- int x, y;
- offset = ((offset + 1) | 1) << log2_denom;
- __asm__ volatile(
- "movd %0, %%mm3 \n\t"
- "movd %1, %%mm4 \n\t"
- "movd %2, %%mm5 \n\t"
- "movd %3, %%mm6 \n\t"
- "pshufw $0, %%mm3, %%mm3 \n\t"
- "pshufw $0, %%mm4, %%mm4 \n\t"
- "pshufw $0, %%mm5, %%mm5 \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
- );
- for(y=0; y<h; y++){
- for(x=0; x<w; x+=4){
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "movd %1, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "pmullw %%mm3, %%mm0 \n\t"
- "pmullw %%mm4, %%mm1 \n\t"
- "paddsw %%mm1, %%mm0 \n\t"
- "paddsw %%mm5, %%mm0 \n\t"
- "psraw %%mm6, %%mm0 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "movd %%mm0, %0 \n\t"
- : "+m"(*(uint32_t*)(dst+x))
- : "m"(*(uint32_t*)(src+x))
- );
- }
- src += stride;
- dst += stride;
- }
-}
-
-#define H264_WEIGHT(W,H) \
-static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
- ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
-} \
-static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
- ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
-}
-
-H264_WEIGHT(16,16)
-H264_WEIGHT(16, 8)
-H264_WEIGHT( 8,16)
-H264_WEIGHT( 8, 8)
-H264_WEIGHT( 8, 4)
-H264_WEIGHT( 4, 8)
-H264_WEIGHT( 4, 4)
-H264_WEIGHT( 4, 2)
-
diff --git a/libavcodec/i386/idct_sse2_xvid.c b/libavcodec/i386/idct_sse2_xvid.c
deleted file mode 100644
index be4f211..0000000
--- a/libavcodec/i386/idct_sse2_xvid.c
+++ /dev/null
@@ -1,394 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- * - SSE2 inverse discrete cosine transform -
- *
- * Copyright(C) 2003 Pascal Massimino <skal at planet-d.net>
- *
- * Conversion to gcc syntax with modifications
- * by Alexander Strange <astrange at ithinksw.com>
- *
- * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
- *
- * This file is part of FFmpeg.
- *
- * Vertical pass is an implementation of the scheme:
- * Loeffler C., Ligtenberg A., and Moschytz C.S.:
- * Practical Fast 1D DCT Algorithm with Eleven Multiplications,
- * Proc. ICASSP 1989, 988-991.
- *
- * Horizontal pass is a double 4x4 vector/matrix multiplication,
- * (see also Intel's Application Note 922:
- * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
- * Copyright (C) 1999 Intel Corporation)
- *
- * More details at http://skal.planet-d.net/coding/dct.html
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with FFmpeg; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/dsputil.h"
-#include "libavcodec/i386/idct_xvid.h"
-
-/*!
- * @file idct_sse2_xvid.c
- * @brief SSE2 idct compatible with xvidmmx
- */
-
-#define X8(x) x,x,x,x,x,x,x,x
-
-#define ROW_SHIFT 11
-#define COL_SHIFT 6
-
-DECLARE_ASM_CONST(16, int16_t, tan1[]) = {X8(13036)}; // tan( pi/16)
-DECLARE_ASM_CONST(16, int16_t, tan2[]) = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1
-DECLARE_ASM_CONST(16, int16_t, tan3[]) = {X8(43790)}; // tan(3pi/16)-1
-DECLARE_ASM_CONST(16, int16_t, sqrt2[])= {X8(23170)}; // 0.5/sqrt(2)
-DECLARE_ASM_CONST(8, uint8_t, m127[]) = {X8(127)};
-
-DECLARE_ASM_CONST(16, int16_t, iTab1[]) = {
- 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
- 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
- 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
- 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab2[]) = {
- 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
- 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
- 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
- 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab3[]) = {
- 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
- 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
- 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
- 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
-};
-
-DECLARE_ASM_CONST(16, int16_t, iTab4[]) = {
- 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
- 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
- 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
- 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
-};
-
-DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders[]) = {
- 65536, 65536, 65536, 65536,
- 3597, 3597, 3597, 3597,
- 2260, 2260, 2260, 2260,
- 1203, 1203, 1203, 1203,
- 120, 120, 120, 120,
- 512, 512, 512, 512
-};
-
-// Temporary storage before the column pass
-#define ROW1 "%%xmm6"
-#define ROW3 "%%xmm4"
-#define ROW5 "%%xmm5"
-#define ROW7 "%%xmm7"
-
-#define CLEAR_ODD(r) "pxor "r","r" \n\t"
-#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
-
-#ifdef ARCH_X86_64
-
-# define ROW0 "%%xmm8"
-# define REG0 ROW0
-# define ROW2 "%%xmm9"
-# define REG2 ROW2
-# define ROW4 "%%xmm10"
-# define REG4 ROW4
-# define ROW6 "%%xmm11"
-# define REG6 ROW6
-# define CLEAR_EVEN(r) CLEAR_ODD(r)
-# define PUT_EVEN(dst) PUT_ODD(dst)
-# define XMMS "%%xmm12"
-# define MOV_32_ONLY "#"
-# define SREG2 REG2
-# define TAN3 "%%xmm13"
-# define TAN1 "%%xmm14"
-
-#else
-
-# define ROW0 "(%0)"
-# define REG0 "%%xmm4"
-# define ROW2 "2*16(%0)"
-# define REG2 "%%xmm4"
-# define ROW4 "4*16(%0)"
-# define REG4 "%%xmm6"
-# define ROW6 "6*16(%0)"
-# define REG6 "%%xmm6"
-# define CLEAR_EVEN(r)
-# define PUT_EVEN(dst) \
- "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
- "movdqa %%xmm2, "dst" \n\t"
-# define XMMS "%%xmm2"
-# define MOV_32_ONLY "movdqa "
-# define SREG2 "%%xmm7"
-# define TAN3 "%%xmm0"
-# define TAN1 "%%xmm2"
-
-#endif
-
-#define ROUND(x) "paddd "MANGLE(x)
-
-#define JZ(reg, to) \
- "testl "reg","reg" \n\t" \
- "jz "to" \n\t"
-
-#define JNZ(reg, to) \
- "testl "reg","reg" \n\t" \
- "jnz "to" \n\t"
-
-#define TEST_ONE_ROW(src, reg, clear) \
- clear \
- "movq "src", %%mm1 \n\t" \
- "por 8+"src", %%mm1 \n\t" \
- "paddusb %%mm0, %%mm1 \n\t" \
- "pmovmskb %%mm1, "reg" \n\t"
-
-#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
- clear1 \
- clear2 \
- "movq "row1", %%mm1 \n\t" \
- "por 8+"row1", %%mm1 \n\t" \
- "movq "row2", %%mm2 \n\t" \
- "por 8+"row2", %%mm2 \n\t" \
- "paddusb %%mm0, %%mm1 \n\t" \
- "paddusb %%mm0, %%mm2 \n\t" \
- "pmovmskb %%mm1, "reg1" \n\t" \
- "pmovmskb %%mm2, "reg2" \n\t"
-
-///IDCT pass on rows.
-#define iMTX_MULT(src, table, rounder, put) \
- "movdqa "src", %%xmm3 \n\t" \
- "movdqa %%xmm3, %%xmm0 \n\t" \
- "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \
- "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \
- "pmaddwd "table", %%xmm0 \n\t" \
- "pmaddwd 16+"table", %%xmm1 \n\t" \
- "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \
- "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \
- "pmaddwd 32+"table", %%xmm2 \n\t" \
- "pmaddwd 48+"table", %%xmm3 \n\t" \
- "paddd %%xmm1, %%xmm0 \n\t" \
- "paddd %%xmm3, %%xmm2 \n\t" \
- rounder", %%xmm0 \n\t" \
- "movdqa %%xmm2, %%xmm3 \n\t" \
- "paddd %%xmm0, %%xmm2 \n\t" \
- "psubd %%xmm3, %%xmm0 \n\t" \
- "psrad $11, %%xmm2 \n\t" \
- "psrad $11, %%xmm0 \n\t" \
- "packssdw %%xmm0, %%xmm2 \n\t" \
- put \
- "1: \n\t"
-
-#define iLLM_HEAD \
- "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
- "movdqa "MANGLE(tan1)", "TAN1" \n\t" \
-
-///IDCT pass on columns.
-#define iLLM_PASS(dct) \
- "movdqa "TAN3", %%xmm1 \n\t" \
- "movdqa "TAN1", %%xmm3 \n\t" \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "pmulhw %%xmm5, %%xmm1 \n\t" \
- "paddsw %%xmm4, "TAN3" \n\t" \
- "paddsw %%xmm5, %%xmm1 \n\t" \
- "psubsw %%xmm5, "TAN3" \n\t" \
- "paddsw %%xmm4, %%xmm1 \n\t" \
- "pmulhw %%xmm7, %%xmm3 \n\t" \
- "pmulhw %%xmm6, "TAN1" \n\t" \
- "paddsw %%xmm6, %%xmm3 \n\t" \
- "psubsw %%xmm7, "TAN1" \n\t" \
- "movdqa %%xmm3, %%xmm7 \n\t" \
- "movdqa "TAN1", %%xmm6 \n\t" \
- "psubsw %%xmm1, %%xmm3 \n\t" \
- "psubsw "TAN3", "TAN1" \n\t" \
- "paddsw %%xmm7, %%xmm1 \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa %%xmm3, %%xmm6 \n\t" \
- "psubsw "TAN3", %%xmm3 \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
- "pmulhw %%xmm4, %%xmm3 \n\t" \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "paddsw "TAN3", "TAN3" \n\t" \
- "paddsw %%xmm3, %%xmm3 \n\t" \
- "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
- MOV_32_ONLY ROW2", "REG2" \n\t" \
- MOV_32_ONLY ROW6", "REG6" \n\t" \
- "movdqa %%xmm7, %%xmm5 \n\t" \
- "pmulhw "REG6", %%xmm7 \n\t" \
- "pmulhw "REG2", %%xmm5 \n\t" \
- "paddsw "REG2", %%xmm7 \n\t" \
- "psubsw "REG6", %%xmm5 \n\t" \
- MOV_32_ONLY ROW0", "REG0" \n\t" \
- MOV_32_ONLY ROW4", "REG4" \n\t" \
- MOV_32_ONLY" "TAN1", (%0) \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw "REG4", "REG0" \n\t" \
- "paddsw "XMMS", "REG4" \n\t" \
- "movdqa "REG4", "XMMS" \n\t" \
- "psubsw %%xmm7, "REG4" \n\t" \
- "paddsw "XMMS", %%xmm7 \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm5, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm5 \n\t" \
- "movdqa %%xmm5, "XMMS" \n\t" \
- "psubsw "TAN3", %%xmm5 \n\t" \
- "paddsw "XMMS", "TAN3" \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm3, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm3 \n\t" \
- MOV_32_ONLY" (%0), "TAN1" \n\t" \
- "psraw $6, %%xmm5 \n\t" \
- "psraw $6, "REG0" \n\t" \
- "psraw $6, "TAN3" \n\t" \
- "psraw $6, %%xmm3 \n\t" \
- "movdqa "TAN3", 1*16("dct") \n\t" \
- "movdqa %%xmm3, 2*16("dct") \n\t" \
- "movdqa "REG0", 5*16("dct") \n\t" \
- "movdqa %%xmm5, 6*16("dct") \n\t" \
- "movdqa %%xmm7, %%xmm0 \n\t" \
- "movdqa "REG4", %%xmm4 \n\t" \
- "psubsw %%xmm1, %%xmm7 \n\t" \
- "psubsw "TAN1", "REG4" \n\t" \
- "paddsw %%xmm0, %%xmm1 \n\t" \
- "paddsw %%xmm4, "TAN1" \n\t" \
- "psraw $6, %%xmm1 \n\t" \
- "psraw $6, %%xmm7 \n\t" \
- "psraw $6, "TAN1" \n\t" \
- "psraw $6, "REG4" \n\t" \
- "movdqa %%xmm1, ("dct") \n\t" \
- "movdqa "TAN1", 3*16("dct") \n\t" \
- "movdqa "REG4", 4*16("dct") \n\t" \
- "movdqa %%xmm7, 7*16("dct") \n\t"
-
-///IDCT pass on columns, assuming rows 4-7 are zero.
-#define iLLM_PASS_SPARSE(dct) \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "paddsw %%xmm4, "TAN3" \n\t" \
- "movdqa %%xmm6, %%xmm3 \n\t" \
- "pmulhw %%xmm6, "TAN1" \n\t" \
- "movdqa %%xmm4, %%xmm1 \n\t" \
- "psubsw %%xmm1, %%xmm3 \n\t" \
- "paddsw %%xmm6, %%xmm1 \n\t" \
- "movdqa "TAN1", %%xmm6 \n\t" \
- "psubsw "TAN3", "TAN1" \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa %%xmm3, %%xmm6 \n\t" \
- "psubsw "TAN3", %%xmm3 \n\t" \
- "paddsw %%xmm6, "TAN3" \n\t" \
- "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
- "pmulhw %%xmm4, %%xmm3 \n\t" \
- "pmulhw %%xmm4, "TAN3" \n\t" \
- "paddsw "TAN3", "TAN3" \n\t" \
- "paddsw %%xmm3, %%xmm3 \n\t" \
- "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
- MOV_32_ONLY ROW2", "SREG2" \n\t" \
- "pmulhw "SREG2", %%xmm5 \n\t" \
- MOV_32_ONLY ROW0", "REG0" \n\t" \
- "movdqa "REG0", %%xmm6 \n\t" \
- "psubsw "SREG2", %%xmm6 \n\t" \
- "paddsw "REG0", "SREG2" \n\t" \
- MOV_32_ONLY" "TAN1", (%0) \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm5, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm5 \n\t" \
- "movdqa %%xmm5, "XMMS" \n\t" \
- "psubsw "TAN3", %%xmm5 \n\t" \
- "paddsw "XMMS", "TAN3" \n\t" \
- "movdqa "REG0", "XMMS" \n\t" \
- "psubsw %%xmm3, "REG0" \n\t" \
- "paddsw "XMMS", %%xmm3 \n\t" \
- MOV_32_ONLY" (%0), "TAN1" \n\t" \
- "psraw $6, %%xmm5 \n\t" \
- "psraw $6, "REG0" \n\t" \
- "psraw $6, "TAN3" \n\t" \
- "psraw $6, %%xmm3 \n\t" \
- "movdqa "TAN3", 1*16("dct") \n\t" \
- "movdqa %%xmm3, 2*16("dct") \n\t" \
- "movdqa "REG0", 5*16("dct") \n\t" \
- "movdqa %%xmm5, 6*16("dct") \n\t" \
- "movdqa "SREG2", %%xmm0 \n\t" \
- "movdqa %%xmm6, %%xmm4 \n\t" \
- "psubsw %%xmm1, "SREG2" \n\t" \
- "psubsw "TAN1", %%xmm6 \n\t" \
- "paddsw %%xmm0, %%xmm1 \n\t" \
- "paddsw %%xmm4, "TAN1" \n\t" \
- "psraw $6, %%xmm1 \n\t" \
- "psraw $6, "SREG2" \n\t" \
- "psraw $6, "TAN1" \n\t" \
- "psraw $6, %%xmm6 \n\t" \
- "movdqa %%xmm1, ("dct") \n\t" \
- "movdqa "TAN1", 3*16("dct") \n\t" \
- "movdqa %%xmm6, 4*16("dct") \n\t" \
- "movdqa "SREG2", 7*16("dct") \n\t"
-
-inline void ff_idct_xvid_sse2(short *block)
-{
- __asm__ volatile(
- "movq "MANGLE(m127)", %%mm0 \n\t"
- iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
- iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
- iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
-
- TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
- JZ("%%eax", "1f")
- iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
-
- TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
- TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
- iLLM_HEAD
- ASMALIGN(4)
- JNZ("%%ecx", "2f")
- JNZ("%%eax", "3f")
- JNZ("%%edx", "4f")
- JNZ("%%esi", "5f")
- iLLM_PASS_SPARSE("%0")
- "jmp 6f \n\t"
- "2: \n\t"
- iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
- "3: \n\t"
- iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
- JZ("%%edx", "1f")
- "4: \n\t"
- iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
- JZ("%%esi", "1f")
- "5: \n\t"
- iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
-#ifndef ARCH_X86_64
- iLLM_HEAD
-#endif
- iLLM_PASS("%0")
- "6: \n\t"
- : "+r"(block)
- :
- : "%eax", "%ecx", "%edx", "%esi", "memory");
-}
-
-void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
-{
- ff_idct_xvid_sse2(block);
- put_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
-{
- ff_idct_xvid_sse2(block);
- add_pixels_clamped_mmx(block, dest, line_size);
-}
diff --git a/libavcodec/i386/idct_xvid.h b/libavcodec/i386/idct_xvid.h
deleted file mode 100644
index 0bf45d5..0000000
--- a/libavcodec/i386/idct_xvid.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * XVID MPEG-4 VIDEO CODEC
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/*!
- * @file idct_xvid.h
- * header for Xvid IDCT functions
- */
-
-#ifndef AVCODEC_I386_IDCT_XVID_H
-#define AVCODEC_I386_IDCT_XVID_H
-
-#include <stdint.h>
-
-void ff_idct_xvid_mmx(short *block);
-void ff_idct_xvid_mmx2(short *block);
-void ff_idct_xvid_sse2(short *block);
-void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block);
-void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block);
-
-#endif /* AVCODEC_I386_IDCT_XVID_H */
diff --git a/libavcodec/i386/mathops.h b/libavcodec/i386/mathops.h
deleted file mode 100644
index 2ae24fc..0000000
--- a/libavcodec/i386/mathops.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * simple math operations
- * Copyright (c) 2006 Michael Niedermayer <michaelni at gmx.at> et al
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_I386_MATHOPS_H
-#define AVCODEC_I386_MATHOPS_H
-
-#ifdef FRAC_BITS
-# define MULL(ra, rb) \
- ({ int rt, dummy; __asm__ (\
- "imull %3 \n\t"\
- "shrdl %4, %%edx, %%eax \n\t"\
- : "=a"(rt), "=d"(dummy)\
- : "a" ((int)ra), "rm" ((int)rb), "i"(FRAC_BITS));\
- rt; })
-#endif
-
-#define MULH(ra, rb) \
- ({ int rt, dummy;\
- __asm__ ("imull %3\n\t" : "=d"(rt), "=a"(dummy): "a" ((int)ra), "rm" ((int)rb));\
- rt; })
-
-#define MUL64(ra, rb) \
- ({ int64_t rt;\
- __asm__ ("imull %2\n\t" : "=A"(rt) : "a" ((int)ra), "g" ((int)rb));\
- rt; })
-
-#endif /* AVCODEC_I386_MATHOPS_H */
diff --git a/libavcodec/i386/mmx.h b/libavcodec/i386/mmx.h
deleted file mode 100644
index fb5c2d4..0000000
--- a/libavcodec/i386/mmx.h
+++ /dev/null
@@ -1,285 +0,0 @@
-/*
- * mmx.h
- * Copyright (C) 1997-2001 H. Dietz and R. Fisher
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef AVCODEC_I386_MMX_H
-#define AVCODEC_I386_MMX_H
-
-#warning Everything in this header is deprecated, use plain __asm__()! New code using this header will be rejected.
-
-/*
- * The type of an value that fits in an MMX register (note that long
- * long constant values MUST be suffixed by LL and unsigned long long
- * values by ULL, lest they be truncated by the compiler)
- */
-
-typedef union {
- long long q; /* Quadword (64-bit) value */
- unsigned long long uq; /* Unsigned Quadword */
- int d[2]; /* 2 Doubleword (32-bit) values */
- unsigned int ud[2]; /* 2 Unsigned Doubleword */
- short w[4]; /* 4 Word (16-bit) values */
- unsigned short uw[4]; /* 4 Unsigned Word */
- char b[8]; /* 8 Byte (8-bit) values */
- unsigned char ub[8]; /* 8 Unsigned Byte */
- float s[2]; /* Single-precision (32-bit) value */
-} mmx_t; /* On an 8-byte (64-bit) boundary */
-
-
-#define mmx_i2r(op,imm,reg) \
- __asm__ volatile (#op " %0, %%" #reg \
- : /* nothing */ \
- : "i" (imm) )
-
-#define mmx_m2r(op,mem,reg) \
- __asm__ volatile (#op " %0, %%" #reg \
- : /* nothing */ \
- : "m" (mem))
-
-#define mmx_r2m(op,reg,mem) \
- __asm__ volatile (#op " %%" #reg ", %0" \
- : "=m" (mem) \
- : /* nothing */ )
-
-#define mmx_r2r(op,regs,regd) \
- __asm__ volatile (#op " %" #regs ", %" #regd)
-
-
-#define emms() __asm__ volatile ("emms")
-
-#define movd_m2r(var,reg) mmx_m2r (movd, var, reg)
-#define movd_r2m(reg,var) mmx_r2m (movd, reg, var)
-#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd)
-
-#define movq_m2r(var,reg) mmx_m2r (movq, var, reg)
-#define movq_r2m(reg,var) mmx_r2m (movq, reg, var)
-#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd)
-
-#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg)
-#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
-#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg)
-#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
-
-#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg)
-#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
-
-#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg)
-#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd)
-#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg)
-#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd)
-#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg)
-#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd)
-
-#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg)
-#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd)
-#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg)
-#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd)
-
-#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg)
-#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd)
-#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg)
-#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd)
-
-#define pand_m2r(var,reg) mmx_m2r (pand, var, reg)
-#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd)
-
-#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg)
-#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd)
-
-#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg)
-#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd)
-#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg)
-#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd)
-#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg)
-#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd)
-
-#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg)
-#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd)
-#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg)
-#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd)
-#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg)
-#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd)
-
-#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg)
-#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd)
-
-#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg)
-#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd)
-
-#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg)
-#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd)
-
-#define por_m2r(var,reg) mmx_m2r (por, var, reg)
-#define por_r2r(regs,regd) mmx_r2r (por, regs, regd)
-
-#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg)
-#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg)
-#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd)
-#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg)
-#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg)
-#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd)
-#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg)
-#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg)
-#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd)
-
-#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg)
-#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg)
-#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd)
-#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg)
-#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg)
-#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd)
-
-#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg)
-#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg)
-#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd)
-#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg)
-#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg)
-#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd)
-#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg)
-#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg)
-#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd)
-
-#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg)
-#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd)
-#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg)
-#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd)
-#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg)
-#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd)
-
-#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg)
-#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd)
-#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg)
-#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd)
-
-#define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg)
-#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd)
-#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg)
-#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd)
-
-#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg)
-#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd)
-#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg)
-#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd)
-#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg)
-#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd)
-
-#define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg)
-#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd)
-#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg)
-#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd)
-#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg)
-#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd)
-
-#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg)
-#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd)
-
-
-/* 3DNOW extensions */
-
-#define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg)
-#define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd)
-
-
-/* AMD MMX extensions - also available in intel SSE */
-
-
-#define mmx_m2ri(op,mem,reg,imm) \
- __asm__ volatile (#op " %1, %0, %%" #reg \
- : /* nothing */ \
- : "m" (mem), "i" (imm))
-#define mmx_r2ri(op,regs,regd,imm) \
- __asm__ volatile (#op " %0, %%" #regs ", %%" #regd \
- : /* nothing */ \
- : "i" (imm) )
-
-#define mmx_fetch(mem,hint) \
- __asm__ volatile ("prefetch" #hint " %0" \
- : /* nothing */ \
- : "m" (mem))
-
-
-#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg)
-
-#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var)
-
-#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg)
-#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd)
-#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg)
-#define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd)
-
-#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm)
-
-#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm)
-
-#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg)
-#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd)
-
-#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg)
-#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd)
-
-#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg)
-#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd)
-
-#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg)
-#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd)
-
-#define pmovmskb(mmreg,reg) \
- __asm__ volatile ("movmskps %" #mmreg ", %" #reg)
-
-#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg)
-#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd)
-
-#define prefetcht0(mem) mmx_fetch (mem, t0)
-#define prefetcht1(mem) mmx_fetch (mem, t1)
-#define prefetcht2(mem) mmx_fetch (mem, t2)
-#define prefetchnta(mem) mmx_fetch (mem, nta)
-
-#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg)
-#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd)
-
-#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm)
-#define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm)
-
-#define sfence() __asm__ volatile ("sfence\n\t")
-
-/* SSE2 */
-#define pshufhw_m2r(var,reg,imm) mmx_m2ri(pshufhw, var, reg, imm)
-#define pshufhw_r2r(regs,regd,imm) mmx_r2ri(pshufhw, regs, regd, imm)
-#define pshuflw_m2r(var,reg,imm) mmx_m2ri(pshuflw, var, reg, imm)
-#define pshuflw_r2r(regs,regd,imm) mmx_r2ri(pshuflw, regs, regd, imm)
-
-#define pshufd_r2r(regs,regd,imm) mmx_r2ri(pshufd, regs, regd, imm)
-
-#define movdqa_m2r(var,reg) mmx_m2r (movdqa, var, reg)
-#define movdqa_r2m(reg,var) mmx_r2m (movdqa, reg, var)
-#define movdqa_r2r(regs,regd) mmx_r2r (movdqa, regs, regd)
-#define movdqu_m2r(var,reg) mmx_m2r (movdqu, var, reg)
-#define movdqu_r2m(reg,var) mmx_r2m (movdqu, reg, var)
-#define movdqu_r2r(regs,regd) mmx_r2r (movdqu, regs, regd)
-
-#define pmullw_r2m(reg,var) mmx_r2m (pmullw, reg, var)
-
-#define pslldq_i2r(imm,reg) mmx_i2r (pslldq, imm, reg)
-#define psrldq_i2r(imm,reg) mmx_i2r (psrldq, imm, reg)
-
-#define punpcklqdq_r2r(regs,regd) mmx_r2r (punpcklqdq, regs, regd)
-#define punpckhqdq_r2r(regs,regd) mmx_r2r (punpckhqdq, regs, regd)
-
-
-#endif /* AVCODEC_I386_MMX_H */
diff --git a/libavcodec/i386/simple_idct_mmx.c b/libavcodec/i386/simple_idct_mmx.c
deleted file mode 100644
index 6306fcb..0000000
--- a/libavcodec/i386/simple_idct_mmx.c
+++ /dev/null
@@ -1,1294 +0,0 @@
-/*
- * Simple IDCT MMX
- *
- * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni at gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "libavcodec/dsputil.h"
-#include "libavcodec/simple_idct.h"
-
-/*
-23170.475006
-22725.260826
-21406.727617
-19265.545870
-16384.000000
-12872.826198
-8866.956905
-4520.335430
-*/
-#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#if 0
-#define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#else
-#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
-#endif
-#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-
-#define ROW_SHIFT 11
-#define COL_SHIFT 20 // 6
-
-DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
-DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
-
-DECLARE_ALIGNED(8, static const int16_t, coeffs[])= {
- 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
-// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
-// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
- 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
- // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
-// 0, 0, 0, 0,
-// 0, 0, 0, 0,
-
- C4, C4, C4, C4,
- C4, -C4, C4, -C4,
-
- C2, C6, C2, C6,
- C6, -C2, C6, -C2,
-
- C1, C3, C1, C3,
- C5, C7, C5, C7,
-
- C3, -C7, C3, -C7,
--C1, -C5, -C1, -C5,
-
- C5, -C1, C5, -C1,
- C7, C3, C7, C3,
-
- C7, -C5, C7, -C5,
- C3, -C1, C3, -C1
-};
-
-#if 0
-static void unused_var_killer(){
- int a= wm1010 + d40000;
- temp[0]=a;
-}
-
-static void inline idctCol (int16_t * col, int16_t *input)
-{
-#undef C0
-#undef C1
-#undef C2
-#undef C3
-#undef C4
-#undef C5
-#undef C6
-#undef C7
- int a0, a1, a2, a3, b0, b1, b2, b3;
- const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-/*
- if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
- col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
- col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
- return;
- }*/
-
-col[8*0] = input[8*0 + 0];
-col[8*1] = input[8*2 + 0];
-col[8*2] = input[8*0 + 1];
-col[8*3] = input[8*2 + 1];
-col[8*4] = input[8*4 + 0];
-col[8*5] = input[8*6 + 0];
-col[8*6] = input[8*4 + 1];
-col[8*7] = input[8*6 + 1];
-
- a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
- a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
- a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
- a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
-
- b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
- b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
- b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
- b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
-
- col[8*0] = (a0 + b0) >> COL_SHIFT;
- col[8*1] = (a1 + b1) >> COL_SHIFT;
- col[8*2] = (a2 + b2) >> COL_SHIFT;
- col[8*3] = (a3 + b3) >> COL_SHIFT;
- col[8*4] = (a3 - b3) >> COL_SHIFT;
- col[8*5] = (a2 - b2) >> COL_SHIFT;
- col[8*6] = (a1 - b1) >> COL_SHIFT;
- col[8*7] = (a0 - b0) >> COL_SHIFT;
-}
-
-static void inline idctRow (int16_t * output, int16_t * input)
-{
- int16_t row[8];
-
- int a0, a1, a2, a3, b0, b1, b2, b3;
- const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-
-row[0] = input[0];
-row[2] = input[1];
-row[4] = input[4];
-row[6] = input[5];
-row[1] = input[8];
-row[3] = input[9];
-row[5] = input[12];
-row[7] = input[13];
-
- if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
- row[0] = row[1] = row[2] = row[3] = row[4] =
- row[5] = row[6] = row[7] = row[0]<<3;
- output[0] = row[0];
- output[2] = row[1];
- output[4] = row[2];
- output[6] = row[3];
- output[8] = row[4];
- output[10] = row[5];
- output[12] = row[6];
- output[14] = row[7];
- return;
- }
-
- a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
- a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
- a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
- a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
-
- b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
- b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
- b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
- b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
-
- row[0] = (a0 + b0) >> ROW_SHIFT;
- row[1] = (a1 + b1) >> ROW_SHIFT;
- row[2] = (a2 + b2) >> ROW_SHIFT;
- row[3] = (a3 + b3) >> ROW_SHIFT;
- row[4] = (a3 - b3) >> ROW_SHIFT;
- row[5] = (a2 - b2) >> ROW_SHIFT;
- row[6] = (a1 - b1) >> ROW_SHIFT;
- row[7] = (a0 - b0) >> ROW_SHIFT;
-
- output[0] = row[0];
- output[2] = row[1];
- output[4] = row[2];
- output[6] = row[3];
- output[8] = row[4];
- output[10] = row[5];
- output[12] = row[6];
- output[14] = row[7];
-}
-#endif
-
-static inline void idct(int16_t *block)
-{
- DECLARE_ALIGNED(8, int64_t, align_tmp[16]);
- int16_t * const temp= (int16_t*)align_tmp;
-
- __asm__ volatile(
-#if 0 //Alternative, simpler variant
-
-#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- #rounder ", %%mm4 \n\t"\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- #rounder ", %%mm0 \n\t"\
- "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
- "paddd %%mm0, %%mm0 \n\t" \
- "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
- "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
- "movq %%mm7, " #dst " \n\t"\
- "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "movq %%mm2, 24+" #dst " \n\t"\
- "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
- "movq %%mm2, 8+" #dst " \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
- "movq %%mm4, 16+" #dst " \n\t"\
-
-#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
- "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
- "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm0 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "movd %%mm7, " #dst " \n\t"\
- "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "movd %%mm0, 16+" #dst " \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "movd %%mm2, 96+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "movd %%mm4, 112+" #dst " \n\t"\
- "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movd %%mm2, 32+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
- "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movd %%mm6, 48+" #dst " \n\t"\
- "movd %%mm4, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"\
-
-
-#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq "MANGLE(wm1010)", %%mm4 \n\t"\
- "pand %%mm0, %%mm4 \n\t"\
- "por %%mm1, %%mm4 \n\t"\
- "por %%mm2, %%mm4 \n\t"\
- "por %%mm3, %%mm4 \n\t"\
- "packssdw %%mm4,%%mm4 \n\t"\
- "movd %%mm4, %%eax \n\t"\
- "orl %%eax, %%eax \n\t"\
- "jz 1f \n\t"\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- #rounder ", %%mm4 \n\t"\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- #rounder ", %%mm0 \n\t"\
- "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
- "paddd %%mm0, %%mm0 \n\t" \
- "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
- "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
- "movq %%mm7, " #dst " \n\t"\
- "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "movq %%mm2, 24+" #dst " \n\t"\
- "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
- "movq %%mm2, 8+" #dst " \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
- "movq %%mm4, 16+" #dst " \n\t"\
- "jmp 2f \n\t"\
- "1: \n\t"\
- "pslld $16, %%mm0 \n\t"\
- "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
- "psrad $13, %%mm0 \n\t"\
- "packssdw %%mm0, %%mm0 \n\t"\
- "movq %%mm0, " #dst " \n\t"\
- "movq %%mm0, 8+" #dst " \n\t"\
- "movq %%mm0, 16+" #dst " \n\t"\
- "movq %%mm0, 24+" #dst " \n\t"\
- "2: \n\t"
-
-
-//IDCT( src0, src4, src1, src5, dst, rounder, shift)
-ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
-/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
-ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
-ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
-
-DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
-DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
-DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
-
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
-
-#else
-
-#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq "MANGLE(wm1010)", %%mm4 \n\t"\
- "pand %%mm0, %%mm4 \n\t"\
- "por %%mm1, %%mm4 \n\t"\
- "por %%mm2, %%mm4 \n\t"\
- "por %%mm3, %%mm4 \n\t"\
- "packssdw %%mm4,%%mm4 \n\t"\
- "movd %%mm4, %%eax \n\t"\
- "orl %%eax, %%eax \n\t"\
- "jz 1f \n\t"\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- #rounder ", %%mm4 \n\t"\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- #rounder ", %%mm0 \n\t"\
- "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
- "paddd %%mm0, %%mm0 \n\t" \
- "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
- "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
- "movq %%mm7, " #dst " \n\t"\
- "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "movq %%mm2, 24+" #dst " \n\t"\
- "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
- "movq %%mm2, 8+" #dst " \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
- "movq %%mm4, 16+" #dst " \n\t"\
- "jmp 2f \n\t"\
- "1: \n\t"\
- "pslld $16, %%mm0 \n\t"\
- "paddd "MANGLE(d40000)", %%mm0 \n\t"\
- "psrad $13, %%mm0 \n\t"\
- "packssdw %%mm0, %%mm0 \n\t"\
- "movq %%mm0, " #dst " \n\t"\
- "movq %%mm0, 8+" #dst " \n\t"\
- "movq %%mm0, 16+" #dst " \n\t"\
- "movq %%mm0, 24+" #dst " \n\t"\
- "2: \n\t"
-
-#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq %%mm0, %%mm4 \n\t"\
- "por %%mm1, %%mm4 \n\t"\
- "por %%mm2, %%mm4 \n\t"\
- "por %%mm3, %%mm4 \n\t"\
- "packssdw %%mm4,%%mm4 \n\t"\
- "movd %%mm4, %%eax \n\t"\
- "orl %%eax, %%eax \n\t"\
- "jz " #bt " \n\t"\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- #rounder ", %%mm4 \n\t"\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- #rounder ", %%mm0 \n\t"\
- "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
- "paddd %%mm0, %%mm0 \n\t" \
- "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
- "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
- "movq %%mm7, " #dst " \n\t"\
- "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "movq %%mm2, 24+" #dst " \n\t"\
- "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
- "movq %%mm2, 8+" #dst " \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
- "movq %%mm4, 16+" #dst " \n\t"\
-
-#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- #rounder ", %%mm4 \n\t"\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- #rounder ", %%mm0 \n\t"\
- "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
- "paddd %%mm0, %%mm0 \n\t" \
- "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
- "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
- "movq %%mm7, " #dst " \n\t"\
- "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "movq %%mm2, 24+" #dst " \n\t"\
- "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
- "movq %%mm2, 8+" #dst " \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
- "movq %%mm4, 16+" #dst " \n\t"\
-
-//IDCT( src0, src4, src1, src5, dst, rounder, shift)
-DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
-Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
-Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
-Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
-
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
- "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
- "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm0 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "movd %%mm7, " #dst " \n\t"\
- "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "movd %%mm0, 16+" #dst " \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "movd %%mm2, 96+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "movd %%mm4, 112+" #dst " \n\t"\
- "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movd %%mm2, 32+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
- "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movd %%mm6, 48+" #dst " \n\t"\
- "movd %%mm4, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"
-
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
- "jmp 9f \n\t"
-
- "#" ASMALIGN(4) \
- "4: \n\t"
-Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
-Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
-
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
- "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
- "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm0 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
- "movd %%mm1, " #dst " \n\t"\
- "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "movd %%mm0, 16+" #dst " \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "movd %%mm2, 96+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "movd %%mm4, 112+" #dst " \n\t"\
- "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "psrad $" #shift ", %%mm1 \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movd %%mm2, 32+" #dst " \n\t"\
- "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
- "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movd %%mm6, 48+" #dst " \n\t"\
- "movd %%mm1, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
- "jmp 9f \n\t"
-
- "#" ASMALIGN(4) \
- "6: \n\t"
-Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
-
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm0 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
- "movd %%mm1, " #dst " \n\t"\
- "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "movd %%mm0, 16+" #dst " \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "movd %%mm2, 96+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "movd %%mm4, 112+" #dst " \n\t"\
- "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "psrad $" #shift ", %%mm1 \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movd %%mm2, 32+" #dst " \n\t"\
- "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
- "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movd %%mm6, 48+" #dst " \n\t"\
- "movd %%mm1, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"
-
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
- "jmp 9f \n\t"
-
- "#" ASMALIGN(4) \
- "2: \n\t"
-Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
-
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm0 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "movd %%mm7, " #dst " \n\t"\
- "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "movd %%mm0, 16+" #dst " \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "movd %%mm2, 96+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "movd %%mm4, 112+" #dst " \n\t"\
- "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movd %%mm2, 32+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
- "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movd %%mm6, 48+" #dst " \n\t"\
- "movd %%mm4, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
- "jmp 9f \n\t"
-
- "#" ASMALIGN(4) \
- "3: \n\t"
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 64(%2), %%mm3 \n\t"\
- "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
- "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm0 \n\t"\
- "psrad $" #shift ", %%mm1 \n\t"\
- "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "movd %%mm7, " #dst " \n\t"\
- "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "movd %%mm0, 16+" #dst " \n\t"\
- "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
- "movd %%mm1, 96+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "movd %%mm4, 112+" #dst " \n\t"\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
- "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
- "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movd %%mm1, 32+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
- "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movd %%mm6, 48+" #dst " \n\t"\
- "movd %%mm4, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"
-
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
- "jmp 9f \n\t"
-
- "#" ASMALIGN(4) \
- "5: \n\t"
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
- "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
- "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
- "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
- "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
- "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
- "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
- "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
- "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
- "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
- "psrad $" #shift ", %%mm4 \n\t"\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm3 \n\t"\
- "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
- "movq %%mm4, " #dst " \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
- "movq %%mm0, 16+" #dst " \n\t"\
- "movq %%mm0, 96+" #dst " \n\t"\
- "movq %%mm4, 112+" #dst " \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "psrad $" #shift ", %%mm6 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movq %%mm5, 32+" #dst " \n\t"\
- "psrad $" #shift ", %%mm1 \n\t"\
- "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movq %%mm6, 48+" #dst " \n\t"\
- "movq %%mm6, 64+" #dst " \n\t"\
- "movq %%mm5, 80+" #dst " \n\t"
-
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
- "jmp 9f \n\t"
-
-
- "#" ASMALIGN(4) \
- "1: \n\t"
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
- "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
- "movq 64(%2), %%mm1 \n\t"\
- "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm0 \n\t"\
- "psrad $" #shift ", %%mm3 \n\t"\
- "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "movd %%mm7, " #dst " \n\t"\
- "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "movd %%mm0, 16+" #dst " \n\t"\
- "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
- "movd %%mm3, 96+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "movd %%mm4, 112+" #dst " \n\t"\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
- "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm3 \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
- "movd %%mm3, 32+" #dst " \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movd %%mm6, 48+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
- "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movd %%mm4, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"
-
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
- "jmp 9f \n\t"
-
-
- "#" ASMALIGN(4)
- "7: \n\t"
-#undef IDCT
-#define IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "psrad $" #shift ", %%mm4 \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
- "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
- "movq %%mm4, " #dst " \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
- "movq %%mm0, 16+" #dst " \n\t"\
- "movq %%mm0, 96+" #dst " \n\t"\
- "movq %%mm4, 112+" #dst " \n\t"\
- "movq %%mm0, 32+" #dst " \n\t"\
- "movq %%mm4, 48+" #dst " \n\t"\
- "movq %%mm4, 64+" #dst " \n\t"\
- "movq %%mm0, 80+" #dst " \n\t"
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
-
-
-#endif
-
-/*
-Input
- 00 40 04 44 20 60 24 64
- 10 30 14 34 50 70 54 74
- 01 41 03 43 21 61 23 63
- 11 31 13 33 51 71 53 73
- 02 42 06 46 22 62 26 66
- 12 32 16 36 52 72 56 76
- 05 45 07 47 25 65 27 67
- 15 35 17 37 55 75 57 77
-
-Temp
- 00 04 10 14 20 24 30 34
- 40 44 50 54 60 64 70 74
- 01 03 11 13 21 23 31 33
- 41 43 51 53 61 63 71 73
- 02 06 12 16 22 26 32 36
- 42 46 52 56 62 66 72 76
- 05 07 15 17 25 27 35 37
- 45 47 55 57 65 67 75 77
-*/
-
-"9: \n\t"
- :: "r" (block), "r" (temp), "r" (coeffs)
- : "%eax"
- );
-}
-
-void ff_simple_idct_mmx(int16_t *block)
-{
- idct(block);
-}
-
-//FIXME merge add/put into the idct
-
-void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
-{
- idct(block);
- put_pixels_clamped_mmx(block, dest, line_size);
-}
-void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
-{
- idct(block);
- add_pixels_clamped_mmx(block, dest, line_size);
-}
diff --git a/libavcodec/i386/vp3dsp_mmx.h b/libavcodec/i386/vp3dsp_mmx.h
deleted file mode 100644
index 2e79913..0000000
--- a/libavcodec/i386/vp3dsp_mmx.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * vp3dsp MMX function declarations
- * Copyright (c) 2007 Aurelien Jacobs <aurel at gnuage.org>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_I386_VP3DSP_MMX_H
-#define AVCODEC_I386_VP3DSP_MMX_H
-
-#include <stdint.h>
-#include "libavcodec/dsputil.h"
-
-void ff_vp3_idct_mmx(int16_t *data);
-void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
-void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
-
-void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
-void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
-
-#endif /* AVCODEC_I386_VP3DSP_MMX_H */
diff --git a/libavcodec/i386/vp3dsp_sse2.h b/libavcodec/i386/vp3dsp_sse2.h
deleted file mode 100644
index 55908c2..0000000
--- a/libavcodec/i386/vp3dsp_sse2.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * vp3dsp SSE2 function declarations
- * Copyright (c) 2007 Aurelien Jacobs <aurel at gnuage.org>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_I386_VP3DSP_SSE2_H
-#define AVCODEC_I386_VP3DSP_SSE2_H
-
-#include "libavcodec/dsputil.h"
-
-void ff_vp3_idct_sse2(int16_t *input_data);
-void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
-void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
-
-#endif /* AVCODEC_I386_VP3DSP_SSE2_H */
diff --git a/libavcodec/i386/x86inc.asm b/libavcodec/i386/x86inc.asm
deleted file mode 100644
index 54c4679..0000000
--- a/libavcodec/i386/x86inc.asm
+++ /dev/null
@@ -1,546 +0,0 @@
-;*****************************************************************************
-;* x86inc.asm
-;*****************************************************************************
-;* Copyright (C) 2005-2008 Loren Merritt <lorenm at u.washington.edu>
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;*****************************************************************************
-
-; FIXME: All of the 64bit asm functions that take a stride as an argument
-; via register, assume that the high dword of that register is filled with 0.
-; This is true in practice (since we never do any 64bit arithmetic on strides,
-; and x264's strides are all positive), but is not guaranteed by the ABI.
-
-; Name of the .rodata section.
-; Kludge: Something on OS X fails to align .rodata even given an align attribute,
-; so use a different read-only section.
-%macro SECTION_RODATA 0
- %ifidn __OUTPUT_FORMAT__,macho64
- SECTION .text align=16
- %elifidn __OUTPUT_FORMAT__,macho
- SECTION .text align=16
- fakegot:
- %else
- SECTION .rodata align=16
- %endif
-%endmacro
-
-; PIC support macros. All these macros are totally harmless when PIC is
-; not defined but can ruin everything if misused in PIC mode. On x86_32, shared
-; objects cannot directly access global variables by address, they need to
-; go through the GOT (global offset table). Most OSes do not care about it
-; and let you load non-shared .so objects (Linux, Win32...). However, OS X
-; requires PIC code in its .dylib objects.
-;
-; - GLOBAL should be used as a suffix for global addressing, eg.
-; picgetgot ebx
-; mov eax, [foo GLOBAL]
-; instead of
-; mov eax, [foo]
-;
-; - picgetgot computes the GOT address into the given register in PIC
-; mode, otherwise does nothing. You need to do this before using GLOBAL.
-; Before in both execution order and compiled code order (so GLOBAL knows
-; which register the GOT is in).
-
-%ifndef PIC
- %define GLOBAL
- %macro picgetgot 1
- %endmacro
-%elifdef ARCH_X86_64
- %define PIC64
- %define GLOBAL wrt rip
- %macro picgetgot 1
- %endmacro
-%else
- %define PIC32
- %ifidn __OUTPUT_FORMAT__,macho
- ; There is no real global offset table on OS X, but we still
- ; need to reference our variables by offset.
- %macro picgetgot 1
- call %%getgot
- %%getgot:
- pop %1
- add %1, $$ - %%getgot
- %undef GLOBAL
- %define GLOBAL + %1 - fakegot
- %endmacro
- %else ; elf
- extern _GLOBAL_OFFSET_TABLE_
- %macro picgetgot 1
- call %%getgot
- %%getgot:
- pop %1
- add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
- %undef GLOBAL
- %define GLOBAL + %1 wrt ..gotoff
- %endmacro
- %endif
-%endif
-
-; Macros to eliminate most code duplication between x86_32 and x86_64:
-; Currently this works only for leaf functions which load all their arguments
-; into registers at the start, and make no other use of the stack. Luckily that
-; covers most of x264's asm.
-
-; PROLOGUE:
-; %1 = number of arguments. loads them from stack if needed.
-; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed.
-; %3 = whether global constants are used in this function. inits x86_32 PIC if needed.
-; %4 = list of names to define to registers
-; PROLOGUE can also be invoked by adding the same options to cglobal
-
-; e.g.
-; cglobal foo, 2,3,0, dst, src, tmp
-; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals
-
-; TODO Some functions can use some args directly from the stack. If they're the
-; last args then you can just not declare them, but if they're in the middle
-; we need more flexible macro.
-
-; RET:
-; Pops anything that was pushed by PROLOGUE
-
-; REP_RET:
-; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
-; which are slow when a normal ret follows a branch.
-
-%macro DECLARE_REG 6
- %define r%1q %2
- %define r%1d %3
- %define r%1w %4
- %define r%1b %5
- %define r%1m %6
- %define r%1 %2
-%endmacro
-
-%macro DECLARE_REG_SIZE 2
- %define r%1q r%1
- %define e%1q r%1
- %define r%1d e%1
- %define e%1d e%1
- %define r%1w %1
- %define e%1w %1
- %define r%1b %2
- %define e%1b %2
-%ifndef ARCH_X86_64
- %define r%1 e%1
-%endif
-%endmacro
-
-DECLARE_REG_SIZE ax, al
-DECLARE_REG_SIZE bx, bl
-DECLARE_REG_SIZE cx, cl
-DECLARE_REG_SIZE dx, dl
-DECLARE_REG_SIZE si, sil
-DECLARE_REG_SIZE di, dil
-DECLARE_REG_SIZE bp, bpl
-
-%ifdef ARCH_X86_64
- %define gprsize 8
-%else
- %define gprsize 4
-%endif
-
-%macro PUSH 1
- push %1
- %assign stack_offset stack_offset+gprsize
-%endmacro
-
-%macro POP 1
- pop %1
- %assign stack_offset stack_offset-gprsize
-%endmacro
-
-%macro SUB 2
- sub %1, %2
- %ifidn %1, rsp
- %assign stack_offset stack_offset+(%2)
- %endif
-%endmacro
-
-%macro ADD 2
- add %1, %2
- %ifidn %1, rsp
- %assign stack_offset stack_offset-(%2)
- %endif
-%endmacro
-
-%macro movifnidn 2
- %ifnidn %1, %2
- mov %1, %2
- %endif
-%endmacro
-
-%macro movsxdifnidn 2
- %ifnidn %1, %2
- movsxd %1, %2
- %endif
-%endmacro
-
-%macro ASSERT 1
- %if (%1) == 0
- %error assert failed
- %endif
-%endmacro
-
-%macro DEFINE_ARGS 0-*
- %ifdef n_arg_names
- %assign %%i 0
- %rep n_arg_names
- CAT_UNDEF arg_name %+ %%i, q
- CAT_UNDEF arg_name %+ %%i, d
- CAT_UNDEF arg_name %+ %%i, w
- CAT_UNDEF arg_name %+ %%i, b
- CAT_UNDEF arg_name, %%i
- %assign %%i %%i+1
- %endrep
- %endif
-
- %assign %%i 0
- %rep %0
- %xdefine %1q r %+ %%i %+ q
- %xdefine %1d r %+ %%i %+ d
- %xdefine %1w r %+ %%i %+ w
- %xdefine %1b r %+ %%i %+ b
- CAT_XDEFINE arg_name, %%i, %1
- %assign %%i %%i+1
- %rotate 1
- %endrep
- %assign n_arg_names %%i
-%endmacro
-
-%ifdef ARCH_X86_64 ;==========================================================
-%ifidn __OUTPUT_FORMAT__,win32
-
-DECLARE_REG 0, rcx, ecx, cx, cl, ecx
-DECLARE_REG 1, rdx, edx, dx, dl, edx
-DECLARE_REG 2, r8, r8d, r8w, r8b, r8d
-DECLARE_REG 3, r9, r9d, r9w, r9b, r9d
-DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40]
-DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48]
-DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
-%define r7m [rsp + stack_offset + 64]
-%define r8m [rsp + stack_offset + 72]
-
-%macro LOAD_IF_USED 2 ; reg_id, number_of_args
- %if %1 < %2
- mov r%1, [rsp + 8 + %1*8]
- %endif
-%endmacro
-
-%else ;=======================================================================
-
-DECLARE_REG 0, rdi, edi, di, dil, edi
-DECLARE_REG 1, rsi, esi, si, sil, esi
-DECLARE_REG 2, rdx, edx, dx, dl, edx
-DECLARE_REG 3, rcx, ecx, cx, cl, ecx
-DECLARE_REG 4, r8, r8d, r8w, r8b, r8d
-DECLARE_REG 5, r9, r9d, r9w, r9b, r9d
-DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
-%define r7m [rsp + stack_offset + 16]
-%define r8m [rsp + stack_offset + 24]
-
-%macro LOAD_IF_USED 2 ; reg_id, number_of_args
- %if %1 < %2
- mov r%1, [rsp - 40 + %1*8]
- %endif
-%endmacro
-
-%endif ; !WIN64
-
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
- ASSERT %2 >= %1
- ASSERT %2 <= 7
- %assign stack_offset 0
-%ifidn __OUTPUT_FORMAT__,win32
- LOAD_IF_USED 4, %1
- LOAD_IF_USED 5, %1
-%endif
- LOAD_IF_USED 6, %1
- DEFINE_ARGS %4
-%endmacro
-
-%macro RET 0
- ret
-%endmacro
-
-%macro REP_RET 0
- rep ret
-%endmacro
-
-%else ; X86_32 ;==============================================================
-
-DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4]
-DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8]
-DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12]
-DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16]
-DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
-DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
-DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
-%define r7m [esp + stack_offset + 32]
-%define r8m [esp + stack_offset + 36]
-%define rsp esp
-
-%macro PUSH_IF_USED 1 ; reg_id
- %if %1 < regs_used
- push r%1
- %assign stack_offset stack_offset+4
- %endif
-%endmacro
-
-%macro POP_IF_USED 1 ; reg_id
- %if %1 < regs_used
- pop r%1
- %endif
-%endmacro
-
-%macro LOAD_IF_USED 2 ; reg_id, number_of_args
- %if %1 < %2
- mov r%1, [esp + stack_offset + 4 + %1*4]
- %endif
-%endmacro
-
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
- ASSERT %2 >= %1
- %assign stack_offset 0
- %assign regs_used %2
- %ifdef PIC
- %if %3
- %assign regs_used regs_used+1
- %endif
- %endif
- ASSERT regs_used <= 7
- PUSH_IF_USED 3
- PUSH_IF_USED 4
- PUSH_IF_USED 5
- PUSH_IF_USED 6
- LOAD_IF_USED 0, %1
- LOAD_IF_USED 1, %1
- LOAD_IF_USED 2, %1
- LOAD_IF_USED 3, %1
- LOAD_IF_USED 4, %1
- LOAD_IF_USED 5, %1
- LOAD_IF_USED 6, %1
- %if %3
- picgetgot r%2
- %endif
- DEFINE_ARGS %4
-%endmacro
-
-%macro RET 0
- POP_IF_USED 6
- POP_IF_USED 5
- POP_IF_USED 4
- POP_IF_USED 3
- ret
-%endmacro
-
-%macro REP_RET 0
- %if regs_used > 3
- RET
- %else
- rep ret
- %endif
-%endmacro
-
-%endif ;======================================================================
-
-
-
-;=============================================================================
-; arch-independent part
-;=============================================================================
-
-%assign function_align 16
-
-; Symbol prefix for C linkage
-%macro cglobal 1-2+
- %ifidn __OUTPUT_FORMAT__,elf
- %ifdef PREFIX
- global _%1:function hidden
- %define %1 _%1
- %else
- global %1:function hidden
- %endif
- %else
- %ifdef PREFIX
- global _%1
- %define %1 _%1
- %else
- global %1
- %endif
- %endif
- align function_align
- %1:
- RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
- %if %0 > 1
- PROLOGUE %2
- %endif
-%endmacro
-
-%macro cextern 1
- %ifdef PREFIX
- extern _%1
- %define %1 _%1
- %else
- extern %1
- %endif
-%endmacro
-
-; This is needed for ELF, otherwise the GNU linker assumes the stack is
-; executable by default.
-%ifidn __OUTPUT_FORMAT__,elf
-SECTION .note.GNU-stack noalloc noexec nowrite progbits
-%endif
-
-%assign FENC_STRIDE 16
-%assign FDEC_STRIDE 32
-
-; merge mmx and sse*
-
-%macro CAT_XDEFINE 3
- %xdefine %1%2 %3
-%endmacro
-
-%macro CAT_UNDEF 2
- %undef %1%2
-%endmacro
-
-%macro INIT_MMX 0
- %define RESET_MM_PERMUTATION INIT_MMX
- %define mmsize 8
- %define num_mmregs 8
- %define mova movq
- %define movu movq
- %define movh movd
- %define movnt movntq
- %assign %%i 0
- %rep 8
- CAT_XDEFINE m, %%i, mm %+ %%i
- CAT_XDEFINE nmm, %%i, %%i
- %assign %%i %%i+1
- %endrep
- %rep 8
- CAT_UNDEF m, %%i
- CAT_UNDEF nmm, %%i
- %assign %%i %%i+1
- %endrep
-%endmacro
-
-%macro INIT_XMM 0
- %define RESET_MM_PERMUTATION INIT_XMM
- %define mmsize 16
- %define num_mmregs 8
- %ifdef ARCH_X86_64
- %define num_mmregs 16
- %endif
- %define mova movdqa
- %define movu movdqu
- %define movh movq
- %define movnt movntdq
- %assign %%i 0
- %rep num_mmregs
- CAT_XDEFINE m, %%i, xmm %+ %%i
- CAT_XDEFINE nxmm, %%i, %%i
- %assign %%i %%i+1
- %endrep
-%endmacro
-
-INIT_MMX
-
-; I often want to use macros that permute their arguments. e.g. there's no
-; efficient way to implement butterfly or transpose or dct without swapping some
-; arguments.
-;
-; I would like to not have to manually keep track of the permutations:
-; If I insert a permutation in the middle of a function, it should automatically
-; change everything that follows. For more complex macros I may also have multiple
-; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
-;
-; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
-; permutes its arguments. It's equivalent to exchanging the contents of the
-; registers, except that this way you exchange the register names instead, so it
-; doesn't cost any cycles.
-
-%macro PERMUTE 2-* ; takes a list of pairs to swap
-%rep %0/2
- %xdefine tmp%2 m%2
- %xdefine ntmp%2 nm%2
- %rotate 2
-%endrep
-%rep %0/2
- %xdefine m%1 tmp%2
- %xdefine nm%1 ntmp%2
- %undef tmp%2
- %undef ntmp%2
- %rotate 2
-%endrep
-%endmacro
-
-%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
-%rep %0-1
-%ifdef m%1
- %xdefine tmp m%1
- %xdefine m%1 m%2
- %xdefine m%2 tmp
- CAT_XDEFINE n, m%1, %1
- CAT_XDEFINE n, m%2, %2
-%else
- ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
- ; Be careful using this mode in nested macros though, as in some cases there may be
- ; other copies of m# that have already been dereferenced and don't get updated correctly.
- %xdefine %%n1 n %+ %1
- %xdefine %%n2 n %+ %2
- %xdefine tmp m %+ %%n1
- CAT_XDEFINE m, %%n1, m %+ %%n2
- CAT_XDEFINE m, %%n2, tmp
- CAT_XDEFINE n, m %+ %%n1, %%n1
- CAT_XDEFINE n, m %+ %%n2, %%n2
-%endif
- %undef tmp
- %rotate 1
-%endrep
-%endmacro
-
-%macro SAVE_MM_PERMUTATION 1
- %assign %%i 0
- %rep num_mmregs
- CAT_XDEFINE %1_m, %%i, m %+ %%i
- %assign %%i %%i+1
- %endrep
-%endmacro
-
-%macro LOAD_MM_PERMUTATION 1
- %assign %%i 0
- %rep num_mmregs
- CAT_XDEFINE m, %%i, %1_m %+ %%i
- %assign %%i %%i+1
- %endrep
-%endmacro
-
-%macro call 1
- call %1
- %ifdef %1_m0
- LOAD_MM_PERMUTATION %1
- %endif
-%endmacro
-
-; substitutions which are functionally identical but reduce code size
-%define movdqa movaps
-%define movdqu movups
-
diff --git a/libavcodec/idcinvideo.c b/libavcodec/idcinvideo.c
index 43889a6..fa3bb16 100644
--- a/libavcodec/idcinvideo.c
+++ b/libavcodec/idcinvideo.c
@@ -60,7 +60,7 @@ typedef struct
int count;
unsigned char used;
int children[2];
-} hnode_t;
+} hnode;
typedef struct IdcinContext {
@@ -70,7 +70,7 @@ typedef struct IdcinContext {
const unsigned char *buf;
int size;
- hnode_t huff_nodes[256][HUF_TOKENS*2];
+ hnode huff_nodes[256][HUF_TOKENS*2];
int num_huff_nodes[256];
} IdcinContext;
@@ -81,7 +81,7 @@ typedef struct IdcinContext {
* Returns the node index of the lowest unused node, or -1 if all nodes
* are used.
*/
-static int huff_smallest_node(hnode_t *hnodes, int num_hnodes) {
+static int huff_smallest_node(hnode *hnodes, int num_hnodes) {
int i;
int best, best_node;
@@ -114,7 +114,7 @@ static int huff_smallest_node(hnode_t *hnodes, int num_hnodes) {
* That is: huff_nodes[prev][num_huff_nodes[prev]] is the root node.
*/
static av_cold void huff_build_tree(IdcinContext *s, int prev) {
- hnode_t *node, *hnodes;
+ hnode *node, *hnodes;
int num_hnodes, i;
num_hnodes = HUF_TOKENS;
@@ -173,7 +173,7 @@ static av_cold int idcin_decode_init(AVCodecContext *avctx)
static void idcin_decode_vlcs(IdcinContext *s)
{
- hnode_t *hnodes;
+ hnode *hnodes;
long x, y;
int prev;
unsigned char v = 0;
diff --git a/libavcodec/imc.c b/libavcodec/imc.c
index 436a5c9..91d9feb 100644
--- a/libavcodec/imc.c
+++ b/libavcodec/imc.c
@@ -143,7 +143,7 @@ static av_cold int imc_decode_init(AVCodecContext * avctx)
/* initialize the VLC tables */
for(i = 0; i < 4 ; i++) {
for(j = 0; j < 4; j++) {
- huffman_vlc[i][j].table = vlc_tables[vlc_offsets[i * 4 + j]];
+ huffman_vlc[i][j].table = &vlc_tables[vlc_offsets[i * 4 + j]];
huffman_vlc[i][j].table_allocated = vlc_offsets[i * 4 + j + 1] - vlc_offsets[i * 4 + j];
init_vlc(&huffman_vlc[i][j], 9, imc_huffman_sizes[i],
imc_huffman_lens[i][j], 1, 1,
@@ -155,6 +155,7 @@ static av_cold int imc_decode_init(AVCodecContext * avctx)
ff_fft_init(&q->fft, 7, 1);
dsputil_init(&q->dsp, avctx);
avctx->sample_fmt = SAMPLE_FMT_S16;
+ avctx->channel_layout = (avctx->channels==2) ? CH_LAYOUT_STEREO : CH_LAYOUT_MONO;
return 0;
}
diff --git a/libavcodec/imgconvert.c b/libavcodec/imgconvert.c
index 72a5bbe..952ddcb 100644
--- a/libavcodec/imgconvert.c
+++ b/libavcodec/imgconvert.c
@@ -35,7 +35,8 @@
#include "colorspace.h"
#ifdef HAVE_MMX
-#include "i386/mmx.h"
+#include "x86/mmx.h"
+#include "x86/dsputil_mmx.h"
#endif
#define xglue(x, y) x ## y
@@ -266,6 +267,9 @@ static const PixFmtInfo pix_fmt_info[PIX_FMT_NB] = {
[PIX_FMT_XVMC_MPEG2_IDCT] = {
.name = "xvmcidct",
},
+ [PIX_FMT_VDPAU_H264] = {
+ .name = "vdpau_h264",
+ },
[PIX_FMT_UYYVYY411] = {
.name = "uyyvyy411",
.nb_channels = 1,
@@ -783,7 +787,7 @@ static int avcodec_find_best_pix_fmt1(int64_t pix_fmt_mask,
dst_pix_fmt = -1;
min_dist = 0x7fffffff;
for(i = 0;i < PIX_FMT_NB; i++) {
- if (pix_fmt_mask & (1 << i)) {
+ if (pix_fmt_mask & (1ULL << i)) {
loss = avcodec_get_pix_fmt_loss(i, src_pix_fmt, has_alpha) & loss_mask;
if (loss == 0) {
dist = avg_bits_per_pixel(i);
@@ -2733,13 +2737,8 @@ static void deinterlace_line(uint8_t *dst,
#else
{
- mmx_t rounder;
- rounder.uw[0]=4;
- rounder.uw[1]=4;
- rounder.uw[2]=4;
- rounder.uw[3]=4;
pxor_r2r(mm7,mm7);
- movq_m2r(rounder,mm6);
+ movq_m2r(ff_pw_4,mm6);
}
for (;size > 3; size-=4) {
DEINT_LINE_LUM
@@ -2776,13 +2775,8 @@ static void deinterlace_line_inplace(uint8_t *lum_m4, uint8_t *lum_m3, uint8_t *
#else
{
- mmx_t rounder;
- rounder.uw[0]=4;
- rounder.uw[1]=4;
- rounder.uw[2]=4;
- rounder.uw[3]=4;
pxor_r2r(mm7,mm7);
- movq_m2r(rounder,mm6);
+ movq_m2r(ff_pw_4,mm6);
}
for (;size > 3; size-=4) {
DEINT_INPLACE_LINE_LUM
diff --git a/libavcodec/imgconvert.h b/libavcodec/imgconvert.h
index bfaa03e..83bce68 100644
--- a/libavcodec/imgconvert.h
+++ b/libavcodec/imgconvert.h
@@ -33,4 +33,7 @@ int ff_fill_pointer(AVPicture *picture, uint8_t *ptr, int pix_fmt, int height);
int ff_get_plane_bytewidth(enum PixelFormat pix_fmt, int width, int plane);
+int img_convert(AVPicture *dst, int dst_pix_fmt, const AVPicture *src,
+ int src_pix_fmt, int src_width, int src_height);
+
#endif /* AVCODEC_IMGCONVERT_H */
diff --git a/libavcodec/imgconvert_template.c b/libavcodec/imgconvert_template.c
index 2d23be8..7b4dbf9 100644
--- a/libavcodec/imgconvert_template.c
+++ b/libavcodec/imgconvert_template.c
@@ -19,10 +19,6 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-/* This header intentionally has no multiple inclusion guards. It is meant to
- * be included multiple times and generates different code depending on the
- * value of certain #defines. */
-
#ifndef RGB_OUT
#define RGB_OUT(d, r, g, b) RGBA_OUT(d, r, g, b, 0xff)
#endif
diff --git a/libavcodec/imgresample.c b/libavcodec/imgresample.c
index 3c70c2a..6ca810c 100644
--- a/libavcodec/imgresample.c
+++ b/libavcodec/imgresample.c
@@ -26,6 +26,7 @@
#include "avcodec.h"
#include "dsputil.h"
+#include "imgconvert.h"
#include "libswscale/swscale.h"
#ifdef HAVE_ALTIVEC
@@ -154,7 +155,7 @@ static void v_resample(uint8_t *dst, int dst_width, const uint8_t *src,
#ifdef HAVE_MMX
-#include "i386/mmx.h"
+#include "x86/mmx.h"
#define FILTER4(reg) \
{\
@@ -182,7 +183,7 @@ static void h_resample_fast4_mmx(uint8_t *dst, int dst_width,
int src_pos, phase;
const uint8_t *s;
int16_t *filter;
- mmx_t tmp;
+ uint64_t tmp;
src_pos = src_start;
pxor_r2r(mm7, mm7);
@@ -199,13 +200,13 @@ static void h_resample_fast4_mmx(uint8_t *dst, int dst_width,
packuswb_r2r(mm7, mm3);
packuswb_r2r(mm7, mm2);
movq_r2m(mm0, tmp);
- dst[0] = tmp.ub[0];
+ dst[0] = tmp & 0xFF;
movq_r2m(mm1, tmp);
- dst[1] = tmp.ub[0];
+ dst[1] = tmp & 0xFF;
movq_r2m(mm2, tmp);
- dst[2] = tmp.ub[0];
+ dst[2] = tmp & 0xFF;
movq_r2m(mm3, tmp);
- dst[3] = tmp.ub[0];
+ dst[3] = tmp & 0xFF;
dst += 4;
dst_width -= 4;
}
@@ -213,7 +214,7 @@ static void h_resample_fast4_mmx(uint8_t *dst, int dst_width,
FILTER4(mm0);
packuswb_r2r(mm7, mm0);
movq_r2m(mm0, tmp);
- dst[0] = tmp.ub[0];
+ dst[0] = tmp & 0xFF;
dst++;
dst_width--;
}
@@ -223,17 +224,14 @@ static void h_resample_fast4_mmx(uint8_t *dst, int dst_width,
static void v_resample4_mmx(uint8_t *dst, int dst_width, const uint8_t *src,
int wrap, int16_t *filter)
{
- int sum, i, v;
+ int sum, i;
const uint8_t *s;
- mmx_t tmp;
- mmx_t coefs[4];
+ uint64_t tmp;
+ uint64_t coefs[4];
for(i=0;i<4;i++) {
- v = filter[i];
- coefs[i].uw[0] = v;
- coefs[i].uw[1] = v;
- coefs[i].uw[2] = v;
- coefs[i].uw[3] = v;
+ tmp = filter[i];
+ coefs[i] = (tmp<<48) + (tmp<<32) + (tmp<<16) + tmp;
}
pxor_r2r(mm7, mm7);
@@ -261,7 +259,7 @@ static void v_resample4_mmx(uint8_t *dst, int dst_width, const uint8_t *src,
packuswb_r2r(mm7, mm0);
movq_r2m(mm0, tmp);
- *(uint32_t *)dst = tmp.ud[0];
+ *(uint32_t *)dst = tmp & 0xFFFFFFFF;
dst += 4;
s += 4;
dst_width -= 4;
diff --git a/libavcodec/indeo3.c b/libavcodec/indeo3.c
index 533057d..eea27f9 100644
--- a/libavcodec/indeo3.c
+++ b/libavcodec/indeo3.c
@@ -147,7 +147,7 @@ static av_cold void iv_free_func(Indeo3DecodeContext *s)
av_free(s->corrector_type);
}
-typedef struct {
+struct ustr {
long xpos;
long ypos;
long width;
@@ -155,7 +155,7 @@ typedef struct {
long split_flag;
long split_direction;
long usl7;
-} ustr_t;
+};
#define LV1_CHECK(buf1,rle_v3,lv1,lp2) \
@@ -213,7 +213,7 @@ static void iv_Decode_Chunk(Indeo3DecodeContext *s,
uint32_t *cur_lp, *ref_lp;
const uint32_t *correction_lp[2], *correctionloworder_lp[2], *correctionhighorder_lp[2];
uint8_t *correction_type_sp[2];
- ustr_t strip_tbl[20], *strip;
+ struct ustr strip_tbl[20], *strip;
int i, j, k, lp1, lp2, flag1, cmd, blks_width, blks_height, region_160_width,
rle_v1, rle_v2, rle_v3;
unsigned short res;
@@ -252,14 +252,14 @@ static void iv_Decode_Chunk(Indeo3DecodeContext *s,
if(cmd == 0) {
strip++;
- memcpy(strip, strip-1, sizeof(ustr_t));
+ memcpy(strip, strip-1, sizeof(*strip));
strip->split_flag = 1;
strip->split_direction = 0;
strip->height = (strip->height > 8 ? ((strip->height+8)>>4)<<3 : 4);
continue;
} else if(cmd == 1) {
strip++;
- memcpy(strip, strip-1, sizeof(ustr_t));
+ memcpy(strip, strip-1, sizeof(*strip));
strip->split_flag = 1;
strip->split_direction = 1;
strip->width = (strip->width > 8 ? ((strip->width+8)>>4)<<3 : 4);
diff --git a/libavcodec/internal.h b/libavcodec/internal.h
new file mode 100644
index 0000000..ec954a4
--- /dev/null
+++ b/libavcodec/internal.h
@@ -0,0 +1,39 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file internal.h
+ * common internal api header.
+ */
+
+#ifndef AVCODEC_INTERNAL_H
+#define AVCODEC_INTERNAL_H
+
+/**
+ * Logs a generic warning message about a missing feature.
+ * @param[in] avc a pointer to an arbitrary struct of which the first field is
+ * a pointer to an AVClass struct
+ * @param[in] feature string containing the name of the missing feature
+ * @param[in] want_sample indicates if samples are wanted which exhibit this feature.
+ * If \p want_sample is non-zero, additional verbage will be added to the log
+ * message which tells the user how to report samples to the development
+ * mailing list.
+ */
+void ff_log_missing_feature(void *avc, const char *feature, int want_sample);
+
+#endif /* AVCODEC_INTERNAL_H */
diff --git a/libavcodec/intrax8.c b/libavcodec/intrax8.c
index 0436deb..1bca7f6 100644
--- a/libavcodec/intrax8.c
+++ b/libavcodec/intrax8.c
@@ -42,7 +42,7 @@ static VLC j_ac_vlc[2][2][8]; //[quant<13],[intra/inter],[select]
static VLC j_dc_vlc[2][8]; //[quant], [select]
static VLC j_orient_vlc[2][4]; //[quant], [select]
-static void x8_vlc_init(){
+static void x8_vlc_init(void){
int i;
#define init_ac_vlc(dst,src) \
@@ -511,7 +511,7 @@ static int x8_decode_intra_mb(IntraX8Context* const w, const int chroma){
int sign;
assert(w->orient<12);
- memset(s->block[0],0x00,64*sizeof(DCTELEM));
+ s->dsp.clear_block(s->block[0]);
if(chroma){
dc_mode=2;
diff --git a/libavcodec/libamr.c b/libavcodec/libamr.c
index 4f56e4d..79ca57f 100644
--- a/libavcodec/libamr.c
+++ b/libavcodec/libamr.c
@@ -81,9 +81,9 @@
#include <amrnb/interf_enc.h>
#endif
-static const char *nb_bitrate_unsupported =
+static const char nb_bitrate_unsupported[] =
"bitrate not supported: use one of 4.75k, 5.15k, 5.9k, 6.7k, 7.4k, 7.95k, 10.2k or 12.2k\n";
-static const char *wb_bitrate_unsupported =
+static const char wb_bitrate_unsupported[] =
"bitrate not supported: use one of 6.6k, 8.85k, 12.65k, 14.25k, 15.85k, 18.25k, 19.85k, 23.05k, or 23.85k\n";
/* Common code for fixed and float version*/
@@ -245,10 +245,10 @@ static int amr_nb_decode_close(AVCodecContext * avctx)
static int amr_nb_decode_frame(AVCodecContext * avctx,
void *data, int *data_size,
- uint8_t * buf, int buf_size)
+ const uint8_t * buf, int buf_size)
{
AMRContext *s = avctx->priv_data;
- uint8_t*amrData=buf;
+ const uint8_t*amrData=buf;
int offset=0;
UWord8 toc, q, ft;
Word16 serial[SERIAL_FRAMESIZE]; /* coded bits */
@@ -441,10 +441,10 @@ static int amr_nb_encode_close(AVCodecContext * avctx)
static int amr_nb_decode_frame(AVCodecContext * avctx,
void *data, int *data_size,
- uint8_t * buf, int buf_size)
+ const uint8_t * buf, int buf_size)
{
AMRContext *s = avctx->priv_data;
- uint8_t*amrData=buf;
+ const uint8_t*amrData=buf;
static const uint8_t block_size[16]={ 12, 13, 15, 17, 19, 20, 26, 31, 5, 0, 0, 0, 0, 0, 0, 0 };
enum Mode dec_mode;
int packet_size;
@@ -654,10 +654,10 @@ static int amr_wb_decode_init(AVCodecContext * avctx)
static int amr_wb_decode_frame(AVCodecContext * avctx,
void *data, int *data_size,
- uint8_t * buf, int buf_size)
+ const uint8_t * buf, int buf_size)
{
AMRWBContext *s = avctx->priv_data;
- uint8_t*amrData=buf;
+ const uint8_t*amrData=buf;
int mode;
int packet_size;
static const uint8_t block_size[16] = {18, 23, 33, 37, 41, 47, 51, 59, 61, 6, 6, 0, 0, 0, 1, 1};
diff --git a/libavcodec/libdiracdec.c b/libavcodec/libdiracdec.c
index 06830f7..6c6e467 100644
--- a/libavcodec/libdiracdec.c
+++ b/libavcodec/libdiracdec.c
@@ -88,10 +88,12 @@ static int libdirac_decode_frame(AVCodecContext *avccontext,
*data_size = 0;
- if (buf_size>0)
+ if (buf_size>0) {
/* set data to decode into buffer */
dirac_buffer (p_dirac_params->p_decoder, buf, buf+buf_size);
-
+ if ((buf[4] &0x08) == 0x08 && (buf[4] & 0x03))
+ avccontext->has_b_frames = 1;
+ }
while (1) {
/* parse data and process result */
DecoderState state = dirac_parse (p_dirac_params->p_decoder);
diff --git a/libavcodec/libschroedingerdec.c b/libavcodec/libschroedingerdec.c
index f9b4d21..36cba01 100644
--- a/libavcodec/libschroedingerdec.c
+++ b/libavcodec/libschroedingerdec.c
@@ -235,6 +235,9 @@ static int libschroedinger_decode_frame(AVCodecContext *avccontext,
do {
if ((enc_buf = FfmpegFindNextSchroParseUnit(&parse_ctx))) {
/* Push buffer into decoder. */
+ if (SCHRO_PARSE_CODE_IS_PICTURE(enc_buf->data[4]) &&
+ SCHRO_PARSE_CODE_NUM_REFS(enc_buf->data[4]) > 0)
+ avccontext->has_b_frames = 1;
state = schro_decoder_push (decoder, enc_buf);
if (state == SCHRO_DECODER_FIRST_ACCESS_UNIT)
libschroedinger_handle_first_access_unit(avccontext);
diff --git a/libavcodec/libxvidff.c b/libavcodec/libxvidff.c
index 36f27c7..3149459 100644
--- a/libavcodec/libxvidff.c
+++ b/libavcodec/libxvidff.c
@@ -38,13 +38,13 @@
#define BUFFER_CAT(x) (&((x)[strlen(x)]))
/* For PPC Use */
-extern int has_altivec(void);
+int has_altivec(void);
/**
* Structure for the private Xvid context.
* This stores all the private context for the codec.
*/
-typedef struct xvid_context {
+struct xvid_context {
void *encoder_handle; /** Handle for Xvid encoder */
int xsize, ysize; /** Frame size */
int vop_flags; /** VOP flags for Xvid encoder */
@@ -58,15 +58,15 @@ typedef struct xvid_context {
char *twopassfile; /** second pass temp file name */
unsigned char *intra_matrix; /** P-Frame Quant Matrix */
unsigned char *inter_matrix; /** I-Frame Quant Matrix */
-} xvid_context_t;
+};
/**
* Structure for the private first-pass plugin.
*/
-typedef struct xvid_ff_pass1 {
+struct xvid_ff_pass1 {
int version; /** Xvid version */
- xvid_context_t *context; /** Pointer to private context */
-} xvid_ff_pass1_t;
+ struct xvid_context *context; /** Pointer to private context */
+};
/* Prototypes - See function implementation for details */
int xvid_strip_vol_header(AVCodecContext *avctx, unsigned char *frame, unsigned int header_len, unsigned int frame_len);
@@ -84,12 +84,12 @@ void xvid_correct_framerate(AVCodecContext *avctx);
av_cold int ff_xvid_encode_init(AVCodecContext *avctx) {
int xerr, i;
int xvid_flags = avctx->flags;
- xvid_context_t *x = avctx->priv_data;
+ struct xvid_context *x = avctx->priv_data;
uint16_t *intra, *inter;
int fd;
xvid_plugin_single_t single;
- xvid_ff_pass1_t rc2pass1;
+ struct xvid_ff_pass1 rc2pass1;
xvid_plugin_2pass2_t rc2pass2;
xvid_gbl_init_t xvid_gbl_init;
xvid_enc_create_t xvid_enc_create;
@@ -166,7 +166,7 @@ av_cold int ff_xvid_encode_init(AVCodecContext *avctx) {
xvid_gbl_init.version = XVID_VERSION;
xvid_gbl_init.debug = 0;
-#ifdef ARCH_POWERPC
+#ifdef ARCH_PPC
/* Xvid's PPC support is borked, use libavcodec to detect */
#ifdef HAVE_ALTIVEC
if( has_altivec() ) {
@@ -208,7 +208,7 @@ av_cold int ff_xvid_encode_init(AVCodecContext *avctx) {
x->twopassfile = NULL;
if( xvid_flags & CODEC_FLAG_PASS1 ) {
- memset(&rc2pass1, 0, sizeof(xvid_ff_pass1_t));
+ memset(&rc2pass1, 0, sizeof(struct xvid_ff_pass1));
rc2pass1.version = XVID_VERSION;
rc2pass1.context = x;
x->twopassbuffer = av_malloc(BUFFER_SIZE);
@@ -370,7 +370,7 @@ int ff_xvid_encode_frame(AVCodecContext *avctx,
unsigned char *frame, int buf_size, void *data) {
int xerr, i;
char *tmp;
- xvid_context_t *x = avctx->priv_data;
+ struct xvid_context *x = avctx->priv_data;
AVFrame *picture = data;
AVFrame *p = &(x->encoded_picture);
@@ -475,7 +475,7 @@ int ff_xvid_encode_frame(AVCodecContext *avctx,
* @return Returns 0, success guaranteed
*/
av_cold int ff_xvid_encode_close(AVCodecContext *avctx) {
- xvid_context_t *x = avctx->priv_data;
+ struct xvid_context *x = avctx->priv_data;
xvid_encore(x->encoder_handle, XVID_ENC_DESTROY, NULL, NULL);
@@ -616,7 +616,7 @@ void xvid_correct_framerate(AVCodecContext *avctx) {
*/
static int xvid_ff_2pass_create(xvid_plg_create_t * param,
void ** handle) {
- xvid_ff_pass1_t *x = (xvid_ff_pass1_t *)param->param;
+ struct xvid_ff_pass1 *x = (struct xvid_ff_pass1 *)param->param;
char *log = x->context->twopassbuffer;
/* Do a quick bounds check */
@@ -645,7 +645,7 @@ static int xvid_ff_2pass_create(xvid_plg_create_t * param,
* @param param Destrooy context
* @return Returns 0, success guaranteed
*/
-static int xvid_ff_2pass_destroy(xvid_context_t *ref,
+static int xvid_ff_2pass_destroy(struct xvid_context *ref,
xvid_plg_destroy_t *param) {
/* Currently cannot think of anything to do on destruction */
/* Still, the framework should be here for reference/use */
@@ -661,7 +661,7 @@ static int xvid_ff_2pass_destroy(xvid_context_t *ref,
* @param param Frame data
* @return Returns 0, success guaranteed
*/
-static int xvid_ff_2pass_before(xvid_context_t *ref,
+static int xvid_ff_2pass_before(struct xvid_context *ref,
xvid_plg_data_t *param) {
int motion_remove;
int motion_replacements;
@@ -704,7 +704,7 @@ static int xvid_ff_2pass_before(xvid_context_t *ref,
* @param param Statistic data
* @return Returns XVID_ERR_xxxx on failure, or 0 on success
*/
-static int xvid_ff_2pass_after(xvid_context_t *ref,
+static int xvid_ff_2pass_after(struct xvid_context *ref,
xvid_plg_data_t *param) {
char *log = ref->twopassbuffer;
char *frame_types = " ipbs";
@@ -770,7 +770,7 @@ AVCodec libxvid_encoder = {
"libxvid",
CODEC_TYPE_VIDEO,
CODEC_ID_XVID,
- sizeof(xvid_context_t),
+ sizeof(struct xvid_context),
ff_xvid_encode_init,
ff_xvid_encode_frame,
ff_xvid_encode_close,
diff --git a/libavcodec/lsp.c b/libavcodec/lsp.c
index d2785f7..f57f621 100644
--- a/libavcodec/lsp.c
+++ b/libavcodec/lsp.c
@@ -72,7 +72,7 @@ static void lsp2poly(int* f, const int16_t* lsp, int lp_half_order)
{
f[i] = f[i-2];
for(j=i; j>1; j--)
- f[j] -= MULL(f[j-1], lsp[2*i-2]) - f[j-2];
+ f[j] -= MULL(f[j-1], lsp[2*i-2], FRAC_BITS) - f[j-2];
f[1] -= lsp[2*i-2] << 8;
}
diff --git a/libavcodec/mathops.h b/libavcodec/mathops.h
index 123fcb7..07265c8 100644
--- a/libavcodec/mathops.h
+++ b/libavcodec/mathops.h
@@ -26,13 +26,13 @@
#ifdef ARCH_X86_32
-#include "i386/mathops.h"
+#include "x86/mathops.h"
-#elif defined(ARCH_ARMV4L)
+#elif defined(ARCH_ARM)
-#include "armv4l/mathops.h"
+#include "arm/mathops.h"
-#elif defined(ARCH_POWERPC)
+#elif defined(ARCH_PPC)
#include "ppc/mathops.h"
@@ -45,7 +45,7 @@
/* generic implementation */
#ifndef MULL
-# define MULL(a,b) (((int64_t)(a) * (int64_t)(b)) >> FRAC_BITS)
+# define MULL(a,b,s) (((int64_t)(a) * (int64_t)(b)) >> (s))
#endif
#ifndef MULH
diff --git a/libavcodec/mimic.c b/libavcodec/mimic.c
index b740275..e30b682 100644
--- a/libavcodec/mimic.c
+++ b/libavcodec/mimic.c
@@ -163,7 +163,7 @@ static int vlc_decode_block(MimicContext *ctx, int num_coeffs, int qscale)
DCTELEM *block = ctx->dct_block;
unsigned int pos;
- memset(block, 0, 64 * sizeof(DCTELEM));
+ ctx->dsp.clear_block(block);
block[0] = get_bits(&ctx->gb, 8) << 3;
diff --git a/libavcodec/mjpega_dump_header_bsf.c b/libavcodec/mjpega_dump_header_bsf.c
index a3f0131..fb27efb 100644
--- a/libavcodec/mjpega_dump_header_bsf.c
+++ b/libavcodec/mjpega_dump_header_bsf.c
@@ -35,6 +35,7 @@ static int mjpega_dump_header(AVBitStreamFilterContext *bsfc, AVCodecContext *av
const uint8_t *buf, int buf_size, int keyframe)
{
uint8_t *poutbufp;
+ unsigned dqt = 0, dht = 0, sof0 = 0;
int i;
if (avctx->codec_id != CODEC_ID_MJPEG) {
@@ -59,12 +60,13 @@ static int mjpega_dump_header(AVBitStreamFilterContext *bsfc, AVCodecContext *av
for (i = 0; i < buf_size - 1; i++) {
if (buf[i] == 0xff) {
switch (buf[i + 1]) {
- case DQT: /* quant off */
- case DHT: /* huff off */
- case SOF0: /* image off */
- bytestream_put_be32(&poutbufp, i + 46);
- break;
+ case DQT: dqt = i + 46; break;
+ case DHT: dht = i + 46; break;
+ case SOF0: sof0 = i + 46; break;
case SOS:
+ bytestream_put_be32(&poutbufp, dqt); /* quant off */
+ bytestream_put_be32(&poutbufp, dht); /* huff off */
+ bytestream_put_be32(&poutbufp, sof0); /* image off */
bytestream_put_be32(&poutbufp, i + 46); /* scan off */
bytestream_put_be32(&poutbufp, i + 46 + AV_RB16(buf + i + 2)); /* data off */
bytestream_put_buffer(&poutbufp, buf + 2, buf_size - 2); /* skip already written SOI */
diff --git a/libavcodec/mjpegdec.c b/libavcodec/mjpegdec.c
index e8a34ef..5dcdb73 100644
--- a/libavcodec/mjpegdec.c
+++ b/libavcodec/mjpegdec.c
@@ -351,9 +351,17 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
/* totally blank picture as progressive JPEG will only add details to it */
if(s->progressive){
- memset(s->picture.data[0], 0, s->picture.linesize[0] * s->height);
- memset(s->picture.data[1], 0, s->picture.linesize[1] * s->height >> (s->v_max - s->v_count[1]));
- memset(s->picture.data[2], 0, s->picture.linesize[2] * s->height >> (s->v_max - s->v_count[2]));
+ int bw = (width + s->h_max*8-1) / (s->h_max*8);
+ int bh = (height + s->v_max*8-1) / (s->v_max*8);
+ for(i=0; i<s->nb_components; i++) {
+ int size = bw * bh * s->h_count[i] * s->v_count[i];
+ av_freep(&s->blocks[i]);
+ av_freep(&s->last_nnz[i]);
+ s->blocks[i] = av_malloc(size * sizeof(**s->blocks));
+ s->last_nnz[i] = av_mallocz(size * sizeof(**s->last_nnz));
+ s->block_stride[i] = bw * s->h_count[i];
+ }
+ memset(s->coefs_finished, 0, sizeof(s->coefs_finished));
}
return 0;
}
@@ -432,27 +440,29 @@ static int decode_block(MJpegDecodeContext *s, DCTELEM *block,
return 0;
}
+static int decode_dc_progressive(MJpegDecodeContext *s, DCTELEM *block, int component,
+ int dc_index, int16_t *quant_matrix, int Al)
+{
+ int val;
+ s->dsp.clear_block(block);
+ val = mjpeg_decode_dc(s, dc_index);
+ if (val == 0xffff) {
+ av_log(s->avctx, AV_LOG_ERROR, "error dc\n");
+ return -1;
+ }
+ val = (val * quant_matrix[0] << Al) + s->last_dc[component];
+ s->last_dc[component] = val;
+ block[0] = val;
+ return 0;
+}
+
/* decode block and dequantize - progressive JPEG version */
-static int decode_block_progressive(MJpegDecodeContext *s, DCTELEM *block,
- int component, int dc_index, int ac_index, int16_t *quant_matrix,
- int ss, int se, int Ah, int Al, int *EOBRUN)
+static int decode_block_progressive(MJpegDecodeContext *s, DCTELEM *block, uint8_t *last_nnz,
+ int ac_index, int16_t *quant_matrix,
+ int ss, int se, int Al, int *EOBRUN)
{
int code, i, j, level, val, run;
- /* DC coef */
- if(!ss){
- val = mjpeg_decode_dc(s, dc_index);
- if (val == 0xffff) {
- av_log(s->avctx, AV_LOG_ERROR, "error dc\n");
- return -1;
- }
- val = (val * quant_matrix[0] << Al) + s->last_dc[component];
- }else
- val = 0;
- s->last_dc[component] = val;
- block[0] = val;
- if(!se) return 0;
- /* AC coefs */
if(*EOBRUN){
(*EOBRUN)--;
return 0;
@@ -505,9 +515,100 @@ static int decode_block_progressive(MJpegDecodeContext *s, DCTELEM *block,
}
}
CLOSE_READER(re, &s->gb)}
+ if(i > *last_nnz)
+ *last_nnz = i;
+ return 0;
+}
+
+#define REFINE_BIT(j) {\
+ UPDATE_CACHE(re, &s->gb);\
+ sign = block[j]>>15;\
+ block[j] += SHOW_UBITS(re, &s->gb, 1) * ((quant_matrix[j]^sign)-sign) << Al;\
+ LAST_SKIP_BITS(re, &s->gb, 1);\
+}
+
+#define ZERO_RUN \
+for(;;i++) {\
+ if(i > last) {\
+ i += run;\
+ if(i > se) {\
+ av_log(s->avctx, AV_LOG_ERROR, "error count: %d\n", i);\
+ return -1;\
+ }\
+ break;\
+ }\
+ j = s->scantable.permutated[i];\
+ if(block[j])\
+ REFINE_BIT(j)\
+ else if(run-- == 0)\
+ break;\
+}
+
+/* decode block and dequantize - progressive JPEG refinement pass */
+static int decode_block_refinement(MJpegDecodeContext *s, DCTELEM *block, uint8_t *last_nnz,
+ int ac_index, int16_t *quant_matrix,
+ int ss, int se, int Al, int *EOBRUN)
+{
+ int code, i=ss, j, sign, val, run;
+ int last = FFMIN(se, *last_nnz);
+
+ OPEN_READER(re, &s->gb);
+ if(*EOBRUN)
+ (*EOBRUN)--;
+ else {
+ for(;;i++) {
+ UPDATE_CACHE(re, &s->gb);
+ GET_VLC(code, re, &s->gb, s->vlcs[1][ac_index].table, 9, 2)
+ /* Progressive JPEG use AC coeffs from zero and this decoder sets offset 16 by default */
+ code -= 16;
+ if(code & 0xF) {
+ run = ((unsigned) code) >> 4;
+ UPDATE_CACHE(re, &s->gb);
+ val = SHOW_UBITS(re, &s->gb, 1);
+ LAST_SKIP_BITS(re, &s->gb, 1);
+ ZERO_RUN;
+ j = s->scantable.permutated[i];
+ val--;
+ block[j] = ((quant_matrix[j]^val)-val) << Al;
+ if(i == se) {
+ if(i > *last_nnz)
+ *last_nnz = i;
+ CLOSE_READER(re, &s->gb)
+ return 0;
+ }
+ }else{
+ run = ((unsigned) code) >> 4;
+ if(run == 0xF){
+ ZERO_RUN;
+ }else{
+ val = run;
+ run = (1 << run);
+ if(val) {
+ UPDATE_CACHE(re, &s->gb);
+ run += SHOW_UBITS(re, &s->gb, val);
+ LAST_SKIP_BITS(re, &s->gb, val);
+ }
+ *EOBRUN = run - 1;
+ break;
+ }
+ }
+ }
+
+ if(i > *last_nnz)
+ *last_nnz = i;
+ }
+
+ for(;i<=last;i++) {
+ j = s->scantable.permutated[i];
+ if(block[j])
+ REFINE_BIT(j)
+ }
+ CLOSE_READER(re, &s->gb);
return 0;
}
+#undef REFINE_BIT
+#undef ZERO_RUN
static int ljpeg_decode_rgb_scan(MJpegDecodeContext *s, int predictor, int point_transform){
int i, mb_x, mb_y;
@@ -660,18 +761,16 @@ static int ljpeg_decode_yuv_scan(MJpegDecodeContext *s, int predictor, int point
return 0;
}
-static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int ss, int se, int Ah, int Al){
+static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah, int Al){
int i, mb_x, mb_y;
- int EOBRUN = 0;
uint8_t* data[MAX_COMPONENTS];
int linesize[MAX_COMPONENTS];
- if(Ah) return 0; /* TODO decode refinement planes too */
-
for(i=0; i < nb_components; i++) {
int c = s->comp_index[i];
data[c] = s->picture.data[c];
linesize[c]=s->linesize[c];
+ s->coefs_finished[c] |= 1;
if(s->avctx->codec->id==CODEC_ID_AMV) {
//picture should be flipped upside-down for this codec
assert(!(s->avctx->flags & CODEC_FLAG_EMU_EDGE));
@@ -695,30 +794,32 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int ss, i
x = 0;
y = 0;
for(j=0;j<n;j++) {
- memset(s->block, 0, sizeof(s->block));
- if (!s->progressive && decode_block(s, s->block, i,
- s->dc_index[i], s->ac_index[i],
- s->quant_matrixes[ s->quant_index[c] ]) < 0) {
- av_log(s->avctx, AV_LOG_ERROR, "error y=%d x=%d\n", mb_y, mb_x);
- return -1;
- }
- if (s->progressive && decode_block_progressive(s, s->block, i,
- s->dc_index[i], s->ac_index[i],
- s->quant_matrixes[ s->quant_index[c] ], ss, se, Ah, Al, &EOBRUN) < 0) {
- av_log(s->avctx, AV_LOG_ERROR, "error y=%d x=%d\n", mb_y, mb_x);
- return -1;
- }
-// av_log(s->avctx, AV_LOG_DEBUG, "mb: %d %d processed\n", mb_y, mb_x);
ptr = data[c] +
(((linesize[c] * (v * mb_y + y) * 8) +
(h * mb_x + x) * 8) >> s->avctx->lowres);
- if (s->interlaced && s->bottom_field)
+ if(s->interlaced && s->bottom_field)
ptr += linesize[c] >> 1;
-//av_log(NULL, AV_LOG_DEBUG, "%d %d %d %d %d %d %d %d \n", mb_x, mb_y, x, y, c, s->bottom_field, (v * mb_y + y) * 8, (h * mb_x + x) * 8);
- if(!s->progressive)
+ if(!s->progressive) {
+ s->dsp.clear_block(s->block);
+ if(decode_block(s, s->block, i,
+ s->dc_index[i], s->ac_index[i],
+ s->quant_matrixes[ s->quant_index[c] ]) < 0) {
+ av_log(s->avctx, AV_LOG_ERROR, "error y=%d x=%d\n", mb_y, mb_x);
+ return -1;
+ }
s->dsp.idct_put(ptr, linesize[c], s->block);
- else
- s->dsp.idct_add(ptr, linesize[c], s->block);
+ } else {
+ int block_idx = s->block_stride[c] * (v * mb_y + y) + (h * mb_x + x);
+ DCTELEM *block = s->blocks[c][block_idx];
+ if(Ah)
+ block[0] += get_bits1(&s->gb) * s->quant_matrixes[ s->quant_index[c] ][0] << Al;
+ else if(decode_dc_progressive(s, block, i, s->dc_index[i], s->quant_matrixes[ s->quant_index[c] ], Al) < 0) {
+ av_log(s->avctx, AV_LOG_ERROR, "error y=%d x=%d\n", mb_y, mb_x);
+ return -1;
+ }
+ }
+// av_log(s->avctx, AV_LOG_DEBUG, "mb: %d %d processed\n", mb_y, mb_x);
+//av_log(NULL, AV_LOG_DEBUG, "%d %d %d %d %d %d %d %d \n", mb_x, mb_y, x, y, c, s->bottom_field, (v * mb_y + y) * 8, (h * mb_x + x) * 8);
if (++x == h) {
x = 0;
y++;
@@ -738,6 +839,49 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int ss, i
return 0;
}
+static int mjpeg_decode_scan_progressive_ac(MJpegDecodeContext *s, int ss, int se, int Ah, int Al){
+ int mb_x, mb_y;
+ int EOBRUN = 0;
+ int c = s->comp_index[0];
+ uint8_t* data = s->picture.data[c];
+ int linesize = s->linesize[c];
+ int last_scan = 0;
+ int16_t *quant_matrix = s->quant_matrixes[ s->quant_index[c] ];
+
+ if(!Al) {
+ s->coefs_finished[c] |= (1LL<<(se+1))-(1LL<<ss);
+ last_scan = !~s->coefs_finished[c];
+ }
+
+ if(s->interlaced && s->bottom_field)
+ data += linesize >> 1;
+
+ for(mb_y = 0; mb_y < s->mb_height; mb_y++) {
+ uint8_t *ptr = data + (mb_y*linesize*8 >> s->avctx->lowres);
+ int block_idx = mb_y * s->block_stride[c];
+ DCTELEM (*block)[64] = &s->blocks[c][block_idx];
+ uint8_t *last_nnz = &s->last_nnz[c][block_idx];
+ for(mb_x = 0; mb_x < s->mb_width; mb_x++, block++, last_nnz++) {
+ int ret;
+ if(Ah)
+ ret = decode_block_refinement(s, *block, last_nnz, s->ac_index[0],
+ quant_matrix, ss, se, Al, &EOBRUN);
+ else
+ ret = decode_block_progressive(s, *block, last_nnz, s->ac_index[0],
+ quant_matrix, ss, se, Al, &EOBRUN);
+ if(ret < 0) {
+ av_log(s->avctx, AV_LOG_ERROR, "error y=%d x=%d\n", mb_y, mb_x);
+ return -1;
+ }
+ if(last_scan) {
+ s->dsp.idct_put(ptr, linesize, *block);
+ ptr += 8 >> s->avctx->lowres;
+ }
+ }
+ }
+ return 0;
+}
+
int ff_mjpeg_decode_sos(MJpegDecodeContext *s)
{
int len, nb_components, i, h, v, predictor, point_transform;
@@ -849,8 +993,13 @@ int ff_mjpeg_decode_sos(MJpegDecodeContext *s)
}
}
}else{
- if(mjpeg_decode_scan(s, nb_components, predictor, ilv, prev_shift, point_transform) < 0)
- return -1;
+ if(s->progressive && predictor) {
+ if(mjpeg_decode_scan_progressive_ac(s, predictor, ilv, prev_shift, point_transform) < 0)
+ return -1;
+ } else {
+ if(mjpeg_decode_scan(s, nb_components, prev_shift, point_transform) < 0)
+ return -1;
+ }
}
emms_c();
return 0;
@@ -1354,6 +1503,10 @@ av_cold int ff_mjpeg_decode_end(AVCodecContext *avctx)
for(j=0;j<4;j++)
free_vlc(&s->vlcs[i][j]);
}
+ for(i=0; i<MAX_COMPONENTS; i++) {
+ av_freep(&s->blocks[i]);
+ av_freep(&s->last_nnz[i]);
+ }
return 0;
}
diff --git a/libavcodec/mjpegdec.h b/libavcodec/mjpegdec.h
index c973a9c..e90d864 100644
--- a/libavcodec/mjpegdec.h
+++ b/libavcodec/mjpegdec.h
@@ -67,6 +67,7 @@ typedef struct MJpegDecodeContext {
int width, height;
int mb_width, mb_height;
int nb_components;
+ int block_stride[MAX_COMPONENTS];
int component_id[MAX_COMPONENTS];
int h_count[MAX_COMPONENTS]; /* horizontal and vertical count for each component */
int v_count[MAX_COMPONENTS];
@@ -83,6 +84,9 @@ typedef struct MJpegDecodeContext {
int linesize[MAX_COMPONENTS]; ///< linesize << interlaced
int8_t *qscale_table;
DECLARE_ALIGNED_16(DCTELEM, block[64]);
+ DCTELEM (*blocks[MAX_COMPONENTS])[64]; ///< intermediate sums (progressive mode)
+ uint8_t *last_nnz[MAX_COMPONENTS];
+ uint64_t coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode)
ScanTable scantable;
DSPContext dsp;
diff --git a/libavcodec/mlp.c b/libavcodec/mlp.c
index 2ac2e65..7ef7f97 100644
--- a/libavcodec/mlp.c
+++ b/libavcodec/mlp.c
@@ -43,26 +43,14 @@ const uint8_t ff_mlp_huffman_tables[3][18][2] = {
static int crc_init = 0;
static AVCRC crc_63[1024];
static AVCRC crc_1D[1024];
-
-
-static int crc_init_2D = 0;
static AVCRC crc_2D[1024];
-int av_cold ff_mlp_init_crc2D(AVCodecParserContext *s)
-{
- if (!crc_init_2D) {
- av_crc_init(crc_2D, 0, 16, 0x002D, sizeof(crc_2D));
- crc_init_2D = 1;
- }
-
- return 0;
-}
-
void av_cold ff_mlp_init_crc()
{
if (!crc_init) {
av_crc_init(crc_63, 0, 8, 0x63, sizeof(crc_63));
av_crc_init(crc_1D, 0, 8, 0x1D, sizeof(crc_1D));
+ av_crc_init(crc_2D, 0, 16, 0x002D, sizeof(crc_2D));
crc_init = 1;
}
}
diff --git a/libavcodec/mlp.h b/libavcodec/mlp.h
index 338a7a3..b81c735 100644
--- a/libavcodec/mlp.h
+++ b/libavcodec/mlp.h
@@ -106,8 +106,6 @@ uint8_t ff_mlp_restart_checksum(const uint8_t *buf, unsigned int bit_size);
*/
uint8_t ff_mlp_calculate_parity(const uint8_t *buf, unsigned int buf_size);
-int ff_mlp_init_crc2D(AVCodecParserContext *s);
-
void ff_mlp_init_crc();
/** XOR four bytes into one. */
diff --git a/libavcodec/mlp_parser.c b/libavcodec/mlp_parser.c
index 4400d71..925ed86 100644
--- a/libavcodec/mlp_parser.c
+++ b/libavcodec/mlp_parser.c
@@ -150,6 +150,12 @@ typedef struct MLPParseContext
int num_substreams;
} MLPParseContext;
+static av_cold int mlp_init(AVCodecParserContext *s)
+{
+ ff_mlp_init_crc();
+ return 0;
+}
+
static int mlp_parse(AVCodecParserContext *s,
AVCodecContext *avctx,
const uint8_t **poutbuf, int *poutbuf_size,
@@ -245,11 +251,11 @@ static int mlp_parse(AVCodecParserContext *s,
if (ff_mlp_read_major_sync(avctx, &mh, &gb) < 0)
goto lost_sync;
-#ifdef CONFIG_AUDIO_NONSHORT
- avctx->bits_per_sample = mh.group1_bits;
- if (avctx->bits_per_sample > 16)
+ avctx->bits_per_raw_sample = mh.group1_bits;
+ if (avctx->bits_per_raw_sample > 16)
avctx->sample_fmt = SAMPLE_FMT_S32;
-#endif
+ else
+ avctx->sample_fmt = SAMPLE_FMT_S16;
avctx->sample_rate = mh.group1_samplerate;
avctx->frame_size = mh.access_unit_size;
@@ -283,7 +289,7 @@ lost_sync:
AVCodecParser mlp_parser = {
{ CODEC_ID_MLP },
sizeof(MLPParseContext),
- ff_mlp_init_crc2D,
+ mlp_init,
mlp_parse,
NULL,
};
diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index 7272458..f4323da 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -40,7 +40,7 @@
static const char* sample_message =
"Please file a bug report following the instructions at "
- "http://ffmpeg.mplayerhq.hu/bugreports.html and include "
+ "http://ffmpeg.org/bugreports.html and include "
"a sample of this file.";
typedef struct SubStream {
@@ -222,7 +222,7 @@ static av_cold int mlp_decode_init(AVCodecContext *avctx)
m->avctx = avctx;
for (substr = 0; substr < MAX_SUBSTREAMS; substr++)
m->substream[substr].lossless_check_data = 0xffffffff;
- avctx->sample_fmt = SAMPLE_FMT_S16;
+
return 0;
}
@@ -296,12 +296,11 @@ static int read_major_sync(MLPDecodeContext *m, GetBitContext *gb)
m->avctx->sample_rate = mh.group1_samplerate;
m->avctx->frame_size = mh.access_unit_size;
-#ifdef CONFIG_AUDIO_NONSHORT
- m->avctx->bits_per_sample = mh.group1_bits;
- if (mh.group1_bits > 16) {
+ m->avctx->bits_per_raw_sample = mh.group1_bits;
+ if (mh.group1_bits > 16)
m->avctx->sample_fmt = SAMPLE_FMT_S32;
- }
-#endif
+ else
+ m->avctx->sample_fmt = SAMPLE_FMT_S16;
m->params_valid = 1;
for (substr = 0; substr < MAX_SUBSTREAMS; substr++)
diff --git a/libavcodec/motion_est.c b/libavcodec/motion_est.c
index 7c1fc67..d9d48a6 100644
--- a/libavcodec/motion_est.c
+++ b/libavcodec/motion_est.c
@@ -307,8 +307,6 @@ int ff_init_me(MpegEncContext *s){
c->sub_motion_search= no_sub_motion_search;
}
- c->temp= c->scratchpad;
-
return 0;
}
diff --git a/libavcodec/mpc.h b/libavcodec/mpc.h
index 3040135..038c34f 100644
--- a/libavcodec/mpc.h
+++ b/libavcodec/mpc.h
@@ -74,7 +74,7 @@ typedef struct {
DECLARE_ALIGNED_16(int32_t, sb_samples[MPA_MAX_CHANNELS][36][SBLIMIT]);
} MPCContext;
-extern void ff_mpc_init();
-extern void ff_mpc_dequantize_and_synth(MPCContext *c, int maxband, void *dst);
+void ff_mpc_init();
+void ff_mpc_dequantize_and_synth(MPCContext *c, int maxband, void *dst);
#endif /* AVCODEC_MPC_H */
diff --git a/libavcodec/mpc7.c b/libavcodec/mpc7.c
index 565b858..2db4172 100644
--- a/libavcodec/mpc7.c
+++ b/libavcodec/mpc7.c
@@ -109,6 +109,7 @@ static av_cold int mpc7_decode_init(AVCodecContext * avctx)
}
vlc_initialized = 1;
avctx->sample_fmt = SAMPLE_FMT_S16;
+ avctx->channel_layout = (avctx->channels==2) ? CH_LAYOUT_STEREO : CH_LAYOUT_MONO;
return 0;
}
diff --git a/libavcodec/mpc8.c b/libavcodec/mpc8.c
index 0d4f128..c47e74d 100644
--- a/libavcodec/mpc8.c
+++ b/libavcodec/mpc8.c
@@ -178,6 +178,7 @@ static av_cold int mpc8_decode_init(AVCodecContext * avctx)
}
vlc_initialized = 1;
avctx->sample_fmt = SAMPLE_FMT_S16;
+ avctx->channel_layout = (avctx->channels==2) ? CH_LAYOUT_STEREO : CH_LAYOUT_MONO;
return 0;
}
diff --git a/libavcodec/mpeg12.c b/libavcodec/mpeg12.c
index 8b6a93e..52ab50b 100644
--- a/libavcodec/mpeg12.c
+++ b/libavcodec/mpeg12.c
@@ -63,10 +63,10 @@ static inline int mpeg2_fast_decode_block_intra(MpegEncContext *s, DCTELEM *bloc
static int mpeg_decode_motion(MpegEncContext *s, int fcode, int pred);
static void exchange_uv(MpegEncContext *s);
-extern int XVMC_field_start(MpegEncContext *s, AVCodecContext *avctx);
-extern int XVMC_field_end(MpegEncContext *s);
-extern void XVMC_pack_pblocks(MpegEncContext *s,int cbp);
-extern void XVMC_init_block(MpegEncContext *s);//set s->block
+int XVMC_field_start(MpegEncContext *s, AVCodecContext *avctx);
+int XVMC_field_end(MpegEncContext *s);
+void XVMC_pack_pblocks(MpegEncContext *s,int cbp);
+void XVMC_init_block(MpegEncContext *s);//set s->block
static const enum PixelFormat pixfmt_xvmc_mpg2_420[] = {
PIX_FMT_XVMC_MPEG2_IDCT,
@@ -303,7 +303,7 @@ static int mpeg_decode_mb(MpegEncContext *s,
}else
memset(s->last_mv, 0, sizeof(s->last_mv)); /* reset mv prediction */
s->mb_intra = 1;
-#ifdef HAVE_XVMC
+#ifdef CONFIG_XVMC
//if 1, we memcpy blocks in xvmcvideo
if(s->avctx->xvmc_acceleration > 1){
XVMC_pack_pblocks(s,-1);//inter are always full blocks
@@ -516,7 +516,7 @@ static int mpeg_decode_mb(MpegEncContext *s,
return -1;
}
-#ifdef HAVE_XVMC
+#ifdef CONFIG_XVMC
//if 1, we memcpy blocks in xvmcvideo
if(s->avctx->xvmc_acceleration > 1){
XVMC_pack_pblocks(s,cbp);
@@ -1212,6 +1212,22 @@ static void quant_matrix_rebuild(uint16_t *matrix, const uint8_t *old_perm,
}
}
+static enum PixelFormat mpeg_get_pixelformat(AVCodecContext *avctx){
+ Mpeg1Context *s1 = avctx->priv_data;
+ MpegEncContext *s = &s1->mpeg_enc_ctx;
+
+ if(avctx->xvmc_acceleration)
+ return avctx->get_format(avctx,pixfmt_xvmc_mpg2_420);
+ else{
+ if(s->chroma_format < 2)
+ return PIX_FMT_YUV420P;
+ else if(s->chroma_format == 2)
+ return PIX_FMT_YUV422P;
+ else
+ return PIX_FMT_YUV444P;
+ }
+}
+
/* Call this function when we know all parameters.
* It may be called in different places for MPEG-1 and MPEG-2. */
static int mpeg_decode_postinit(AVCodecContext *avctx){
@@ -1288,19 +1304,7 @@ static int mpeg_decode_postinit(AVCodecContext *avctx){
}
}//MPEG-2
- if(avctx->xvmc_acceleration){
- avctx->pix_fmt = avctx->get_format(avctx,pixfmt_xvmc_mpg2_420);
- }else{
- if(s->chroma_format < 2){
- avctx->pix_fmt = PIX_FMT_YUV420P;
- }else
- if(s->chroma_format == 2){
- avctx->pix_fmt = PIX_FMT_YUV422P;
- }else
- if(s->chroma_format > 2){
- avctx->pix_fmt = PIX_FMT_YUV444P;
- }
- }
+ avctx->pix_fmt = mpeg_get_pixelformat(avctx);
//until then pix_fmt may be changed right after codec init
if( avctx->pix_fmt == PIX_FMT_XVMC_MPEG2_IDCT )
if( avctx->idct_algo == FF_IDCT_AUTO )
@@ -1639,7 +1643,7 @@ static int mpeg_field_start(MpegEncContext *s){
}
}
}
-#ifdef HAVE_XVMC
+#ifdef CONFIG_XVMC
// MPV_frame_start will call this function too,
// but we need to call it on every field
if(s->avctx->xvmc_acceleration)
@@ -1730,7 +1734,7 @@ static int mpeg_decode_slice(Mpeg1Context *s1, int mb_y,
}
for(;;) {
-#ifdef HAVE_XVMC
+#ifdef CONFIG_XVMC
//If 1, we memcpy blocks in xvmcvideo.
if(s->avctx->xvmc_acceleration > 1)
XVMC_init_block(s);//set s->block
@@ -1912,7 +1916,7 @@ static int slice_end(AVCodecContext *avctx, AVFrame *pict)
if (!s1->mpeg_enc_ctx_allocated || !s->current_picture_ptr)
return 0;
-#ifdef HAVE_XVMC
+#ifdef CONFIG_XVMC
if(s->avctx->xvmc_acceleration)
XVMC_field_end(s);
#endif
@@ -2069,11 +2073,7 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
avctx->has_b_frames= 0; //true?
s->low_delay= 1;
- if(avctx->xvmc_acceleration){
- avctx->pix_fmt = avctx->get_format(avctx,pixfmt_xvmc_mpg2_420);
- }else{
- avctx->pix_fmt = PIX_FMT_YUV420P;
- }
+ avctx->pix_fmt = mpeg_get_pixelformat(avctx);
if( avctx->pix_fmt == PIX_FMT_XVMC_MPEG2_IDCT )
if( avctx->idct_algo == FF_IDCT_AUTO )
@@ -2472,7 +2472,7 @@ AVCodec mpegvideo_decoder = {
.long_name= NULL_IF_CONFIG_SMALL("MPEG-1 video"),
};
-#ifdef HAVE_XVMC
+#ifdef CONFIG_XVMC
static av_cold int mpeg_mc_decode_init(AVCodecContext *avctx){
Mpeg1Context *s;
diff --git a/libavcodec/mpegaudio.h b/libavcodec/mpegaudio.h
index 21de2da..33af18d 100644
--- a/libavcodec/mpegaudio.h
+++ b/libavcodec/mpegaudio.h
@@ -128,7 +128,7 @@ typedef struct HuffTable {
} HuffTable;
int ff_mpa_l2_select_table(int bitrate, int nb_channels, int freq, int lsf);
-int ff_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate);
+int ff_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bitrate);
void ff_mpa_synth_init(MPA_INT *window);
void ff_mpa_synth_filter(MPA_INT *synth_buf_ptr, int *synth_buf_offset,
MPA_INT *window, int *dither_state,
diff --git a/libavcodec/mpegaudio_parser.c b/libavcodec/mpegaudio_parser.c
index e7cb743..e4b17b0 100644
--- a/libavcodec/mpegaudio_parser.c
+++ b/libavcodec/mpegaudio_parser.c
@@ -44,7 +44,7 @@ typedef struct MpegAudioParseContext {
/* useful helper to get mpeg audio stream infos. Return -1 if error in
header, otherwise the coded frame size in bytes */
-int ff_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate)
+int ff_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate, int *channels, int *frame_size, int *bit_rate)
{
MPADecodeContext s1, *s = &s1;
s1.avctx = avctx;
@@ -58,23 +58,23 @@ int ff_mpa_decode_header(AVCodecContext *avctx, uint32_t head, int *sample_rate)
switch(s->layer) {
case 1:
- avctx->frame_size = 384;
+ *frame_size = 384;
break;
case 2:
- avctx->frame_size = 1152;
+ *frame_size = 1152;
break;
default:
case 3:
if (s->lsf)
- avctx->frame_size = 576;
+ *frame_size = 576;
else
- avctx->frame_size = 1152;
+ *frame_size = 1152;
break;
}
*sample_rate = s->sample_rate;
- avctx->channels = s->nb_channels;
- avctx->bit_rate = s->bit_rate;
+ *channels = s->nb_channels;
+ *bit_rate = s->bit_rate;
avctx->sub_id = s->layer;
return s->frame_size;
}
@@ -92,7 +92,7 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
const uint8_t *buf, int buf_size)
{
MpegAudioParseContext *s = s1->priv_data;
- int len, ret, sr;
+ int len, ret, sr, channels, bit_rate, frame_size;
uint32_t header;
const uint8_t *buf_ptr;
@@ -123,7 +123,7 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
got_header:
header = AV_RB32(s->inbuf);
- ret = ff_mpa_decode_header(avctx, header, &sr);
+ ret = ff_mpa_decode_header(avctx, header, &sr, &channels, &frame_size, &bit_rate);
if (ret < 0) {
s->header_count= -2;
/* no sync found : move by one byte (inefficient, but simple!) */
@@ -146,8 +146,12 @@ static int mpegaudio_parse(AVCodecParserContext *s1,
s->frame_size = -1;
}
#endif
- if(s->header_count > 1)
+ if(s->header_count > 1){
avctx->sample_rate= sr;
+ avctx->channels = channels;
+ avctx->frame_size = frame_size;
+ avctx->bit_rate = bit_rate;
+ }
}
}
} else
diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c
index a277492..a853562 100644
--- a/libavcodec/mpegaudiodec.c
+++ b/libavcodec/mpegaudiodec.c
@@ -351,9 +351,9 @@ static int decode_init(AVCodecContext * avctx)
int n, norm;
n = i + 2;
norm = ((INT64_C(1) << n) * FRAC_ONE) / ((1 << n) - 1);
- scale_factor_mult[i][0] = MULL(FIXR(1.0 * 2.0), norm);
- scale_factor_mult[i][1] = MULL(FIXR(0.7937005259 * 2.0), norm);
- scale_factor_mult[i][2] = MULL(FIXR(0.6299605249 * 2.0), norm);
+ scale_factor_mult[i][0] = MULL(FIXR(1.0 * 2.0), norm, FRAC_BITS);
+ scale_factor_mult[i][1] = MULL(FIXR(0.7937005259 * 2.0), norm, FRAC_BITS);
+ scale_factor_mult[i][2] = MULL(FIXR(0.6299605249 * 2.0), norm, FRAC_BITS);
dprintf(avctx, "%d: norm=%x s=%x %x %x\n",
i, norm,
scale_factor_mult[i][0],
@@ -1097,7 +1097,7 @@ static void imdct36(int *out, int *buf, int *in, int *win)
t2 = tmp[i + 1];
t3 = tmp[i + 3];
s1 = MULH(2*(t3 + t2), icos36h[j]);
- s3 = MULL(t3 - t2, icos36[8 - j]);
+ s3 = MULL(t3 - t2, icos36[8 - j], FRAC_BITS);
t0 = s0 + s1;
t1 = s0 - s1;
@@ -1705,8 +1705,8 @@ static void compute_stereo(MPADecodeContext *s,
v2 = is_tab[1][sf];
for(j=0;j<len;j++) {
tmp0 = tab0[j];
- tab0[j] = MULL(tmp0, v1);
- tab1[j] = MULL(tmp0, v2);
+ tab0[j] = MULL(tmp0, v1, FRAC_BITS);
+ tab1[j] = MULL(tmp0, v2, FRAC_BITS);
}
} else {
found1:
@@ -1716,8 +1716,8 @@ static void compute_stereo(MPADecodeContext *s,
for(j=0;j<len;j++) {
tmp0 = tab0[j];
tmp1 = tab1[j];
- tab0[j] = MULL(tmp0 + tmp1, ISQRT2);
- tab1[j] = MULL(tmp0 - tmp1, ISQRT2);
+ tab0[j] = MULL(tmp0 + tmp1, ISQRT2, FRAC_BITS);
+ tab1[j] = MULL(tmp0 - tmp1, ISQRT2, FRAC_BITS);
}
}
}
@@ -1749,8 +1749,8 @@ static void compute_stereo(MPADecodeContext *s,
v2 = is_tab[1][sf];
for(j=0;j<len;j++) {
tmp0 = tab0[j];
- tab0[j] = MULL(tmp0, v1);
- tab1[j] = MULL(tmp0, v2);
+ tab0[j] = MULL(tmp0, v1, FRAC_BITS);
+ tab1[j] = MULL(tmp0, v2, FRAC_BITS);
}
} else {
found2:
@@ -1760,8 +1760,8 @@ static void compute_stereo(MPADecodeContext *s,
for(j=0;j<len;j++) {
tmp0 = tab0[j];
tmp1 = tab1[j];
- tab0[j] = MULL(tmp0 + tmp1, ISQRT2);
- tab1[j] = MULL(tmp0 - tmp1, ISQRT2);
+ tab0[j] = MULL(tmp0 + tmp1, ISQRT2, FRAC_BITS);
+ tab1[j] = MULL(tmp0 - tmp1, ISQRT2, FRAC_BITS);
}
}
}
diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
index 4bc32a2..43a6e85 100644
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -54,9 +54,9 @@ static void dct_unquantize_h263_intra_c(MpegEncContext *s,
static void dct_unquantize_h263_inter_c(MpegEncContext *s,
DCTELEM *block, int n, int qscale);
-extern int XVMC_field_start(MpegEncContext*s, AVCodecContext *avctx);
-extern void XVMC_field_end(MpegEncContext *s);
-extern void XVMC_decode_mb(MpegEncContext *s);
+int XVMC_field_start(MpegEncContext*s, AVCodecContext *avctx);
+void XVMC_field_end(MpegEncContext *s);
+void XVMC_decode_mb(MpegEncContext *s);
/* enable all paranoid tests for rounding, overflows, etc... */
@@ -129,8 +129,8 @@ int ff_dct_common_init(MpegEncContext *s)
MPV_common_init_mlib(s);
#elif defined(HAVE_MMI)
MPV_common_init_mmi(s);
-#elif defined(ARCH_ARMV4L)
- MPV_common_init_armv4l(s);
+#elif defined(ARCH_ARM)
+ MPV_common_init_arm(s);
#elif defined(HAVE_ALTIVEC)
MPV_common_init_altivec(s);
#elif defined(ARCH_BFIN)
@@ -289,6 +289,7 @@ static int init_duplicate_context(MpegEncContext *s, MpegEncContext *base){
//FIXME should be linesize instead of s->width*2 but that is not known before get_buffer()
CHECKED_ALLOCZ(s->me.scratchpad, (s->width+64)*4*16*2*sizeof(uint8_t))
+ s->me.temp= s->me.scratchpad;
s->rd_scratchpad= s->me.scratchpad;
s->b_scratchpad= s->me.scratchpad;
s->obmc_scratchpad= s->me.scratchpad + 16;
@@ -315,6 +316,7 @@ static void free_duplicate_context(MpegEncContext *s){
av_freep(&s->allocated_edge_emu_buffer); s->edge_emu_buffer= NULL;
av_freep(&s->me.scratchpad);
+ s->me.temp=
s->rd_scratchpad=
s->b_scratchpad=
s->obmc_scratchpad= NULL;
@@ -331,6 +333,7 @@ static void backup_duplicate_context(MpegEncContext *bak, MpegEncContext *src){
COPY(allocated_edge_emu_buffer);
COPY(edge_emu_buffer);
COPY(me.scratchpad);
+ COPY(me.temp);
COPY(rd_scratchpad);
COPY(b_scratchpad);
COPY(obmc_scratchpad);
@@ -936,7 +939,7 @@ alloc:
update_noise_reduction(s);
}
-#ifdef HAVE_XVMC
+#ifdef CONFIG_XVMC
if(s->avctx->xvmc_acceleration)
return XVMC_field_start(s, avctx);
#endif
@@ -948,13 +951,17 @@ void MPV_frame_end(MpegEncContext *s)
{
int i;
/* draw edge for correct motion prediction if outside */
-#ifdef HAVE_XVMC
+#ifdef CONFIG_XVMC
//just to make sure that all data is rendered.
if(s->avctx->xvmc_acceleration){
XVMC_field_end(s);
}else
#endif
- if(s->unrestricted_mv && s->current_picture.reference && !s->intra_only && !(s->flags&CODEC_FLAG_EMU_EDGE)) {
+ if(!(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
+ && s->unrestricted_mv
+ && s->current_picture.reference
+ && !s->intra_only
+ && !(s->flags&CODEC_FLAG_EMU_EDGE)) {
s->dsp.draw_edges(s->current_picture.data[0], s->linesize , s->h_edge_pos , s->v_edge_pos , EDGE_WIDTH );
s->dsp.draw_edges(s->current_picture.data[1], s->uvlinesize, s->h_edge_pos>>1, s->v_edge_pos>>1, EDGE_WIDTH/2);
s->dsp.draw_edges(s->current_picture.data[2], s->uvlinesize, s->h_edge_pos>>1, s->v_edge_pos>>1, EDGE_WIDTH/2);
@@ -1729,7 +1736,7 @@ void MPV_decode_mb_internal(MpegEncContext *s, DCTELEM block[12][64],
{
int mb_x, mb_y;
const int mb_xy = s->mb_y * s->mb_stride + s->mb_x;
-#ifdef HAVE_XVMC
+#ifdef CONFIG_XVMC
if(s->avctx->xvmc_acceleration){
XVMC_decode_mb(s);//xvmc uses pblocks
return;
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index 4ddd30c..da08bb6 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -546,6 +546,9 @@ typedef struct MpegEncContext {
int mpeg_quant;
int t_frame; ///< time distance of first I -> B, used for interlaced b frames
int padding_bug_score; ///< used to detect the VERY common padding bug in MPEG4
+ int cplx_estimation_trash_i;
+ int cplx_estimation_trash_p;
+ int cplx_estimation_trash_b;
/* divx specific, used to workaround (many) bugs in divx5 */
int divx_version;
@@ -681,7 +684,7 @@ void MPV_common_init_mmx(MpegEncContext *s);
void MPV_common_init_axp(MpegEncContext *s);
void MPV_common_init_mlib(MpegEncContext *s);
void MPV_common_init_mmi(MpegEncContext *s);
-void MPV_common_init_armv4l(MpegEncContext *s);
+void MPV_common_init_arm(MpegEncContext *s);
void MPV_common_init_altivec(MpegEncContext *s);
void ff_clean_intra_table_entries(MpegEncContext *s);
void ff_draw_horiz_band(MpegEncContext *s, int y, int h);
diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index 6c4a028..72af47a 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -35,6 +35,7 @@
#include "msmpeg4.h"
#include "h263.h"
#include "faandct.h"
+#include "aandcttab.h"
#include <limits.h>
//#undef NDEBUG
@@ -49,29 +50,6 @@ static int sse_mb(MpegEncContext *s);
//#define DEBUG
-static const uint16_t aanscales[64] = {
- /* precomputed values scaled up by 14 bits */
- 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
- 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270,
- 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906,
- 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
- 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
- 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552,
- 8867 , 12299, 11585, 10426, 8867, 6967, 4799, 2446,
- 4520 , 6270, 5906, 5315, 4520, 3552, 2446, 1247
-};
-
-const uint16_t ff_inv_aanscales[64] = {
- 4096, 2953, 3135, 3483, 4096, 5213, 7568, 14846,
- 2953, 2129, 2260, 2511, 2953, 3759, 5457, 10703,
- 3135, 2260, 2399, 2666, 3135, 3990, 5793, 11363,
- 3483, 2511, 2666, 2962, 3483, 4433, 6436, 12625,
- 4096, 2953, 3135, 3483, 4096, 5213, 7568, 14846,
- 5213, 3759, 3990, 4433, 5213, 6635, 9633, 18895,
- 7568, 5457, 5793, 6436, 7568, 9633, 13985, 27432,
- 14846, 10703, 11363, 12625, 14846, 18895, 27432, 53809,
-};
-
static uint8_t default_mv_penalty[MAX_FCODE+1][MAX_MV*2+1];
static uint8_t default_fcode_tab[MAX_MV*2+1];
@@ -91,9 +69,9 @@ void ff_convert_matrix(DSPContext *dsp, int (*qmat)[64], uint16_t (*qmat16)[2][6
for(i=0;i<64;i++) {
const int j= dsp->idct_permutation[i];
/* 16 <= qscale * quant_matrix[i] <= 7905 */
- /* 19952 <= aanscales[i] * qscale * quant_matrix[i] <= 249205026 */
- /* (1<<36)/19952 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= (1<<36)/249205026 */
- /* 3444240 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= 275 */
+ /* 19952 <= ff_aanscales[i] * qscale * quant_matrix[i] <= 249205026 */
+ /* (1 << 36) / 19952 >= (1 << 36) / (ff_aanscales[i] * qscale * quant_matrix[i]) >= (1 << 36) / 249205026 */
+ /* 3444240 >= (1 << 36) / (ff_aanscales[i] * qscale * quant_matrix[i]) >= 275 */
qmat[qscale][i] = (int)((UINT64_C(1) << QMAT_SHIFT) /
(qscale * quant_matrix[j]));
@@ -106,12 +84,12 @@ void ff_convert_matrix(DSPContext *dsp, int (*qmat)[64], uint16_t (*qmat16)[2][6
for(i=0;i<64;i++) {
const int j= dsp->idct_permutation[i];
/* 16 <= qscale * quant_matrix[i] <= 7905 */
- /* 19952 <= aanscales[i] * qscale * quant_matrix[i] <= 249205026 */
- /* (1<<36)/19952 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= (1<<36)/249205026 */
- /* 3444240 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= 275 */
+ /* 19952 <= ff_aanscales[i] * qscale * quant_matrix[i] <= 249205026 */
+ /* (1 << 36) / 19952 >= (1 << 36) / (ff_aanscales[i] * qscale * quant_matrix[i]) >= (1<<36)/249205026 */
+ /* 3444240 >= (1 << 36) / (ff_aanscales[i] * qscale * quant_matrix[i]) >= 275 */
qmat[qscale][i] = (int)((UINT64_C(1) << (QMAT_SHIFT + 14)) /
- (aanscales[i] * qscale * quant_matrix[j]));
+ (ff_aanscales[i] * qscale * quant_matrix[j]));
}
} else {
for(i=0;i<64;i++) {
@@ -137,7 +115,7 @@ void ff_convert_matrix(DSPContext *dsp, int (*qmat)[64], uint16_t (*qmat16)[2][6
|| dsp->fdct == ff_faandct
#endif
) {
- max= (8191LL*aanscales[i]) >> 14;
+ max = (8191LL*ff_aanscales[i]) >> 14;
}
while(((max * qmat[qscale][i]) >> shift) > INT_MAX){
shift++;
@@ -1253,7 +1231,7 @@ vbv_retry:
if(avctx->rc_buffer_size){
RateControlContext *rcc= &s->rc_context;
- int max_size= rcc->buffer_index/3;
+ int max_size= rcc->buffer_index * avctx->rc_max_available_vbv_use;
if(put_bits_count(&s->pb) > max_size && s->lambda < s->avctx->lmax){
s->next_lambda= FFMAX(s->lambda+1, s->lambda*(s->qscale+1) / s->qscale);
diff --git a/libavcodec/msmpeg4.h b/libavcodec/msmpeg4.h
index 3226015..1442959 100644
--- a/libavcodec/msmpeg4.h
+++ b/libavcodec/msmpeg4.h
@@ -52,7 +52,9 @@ int ff_wmv2_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
#define ENABLE_MSMPEG4_DECODER (ENABLE_MSMPEG4V1_DECODER || \
ENABLE_MSMPEG4V2_DECODER || \
ENABLE_MSMPEG4V3_DECODER || \
- ENABLE_WMV2_DECODER)
+ ENABLE_WMV2_DECODER || \
+ ENABLE_VC1_DECODER || \
+ ENABLE_WMV3_DECODER)
#define ENABLE_MSMPEG4_ENCODER (ENABLE_MSMPEG4V1_ENCODER || \
ENABLE_MSMPEG4V2_ENCODER || \
ENABLE_MSMPEG4V3_ENCODER || \
diff --git a/libavcodec/msmpeg4data.c b/libavcodec/msmpeg4data.c
index da899b5..07d11f7 100644
--- a/libavcodec/msmpeg4data.c
+++ b/libavcodec/msmpeg4data.c
@@ -35,22 +35,22 @@ VLC ff_msmp4_dc_chroma_vlc[2];
/* intra picture macroblock coded block pattern */
const uint16_t ff_msmp4_mb_i_table[64][2] = {
-{ 0x1, 1 },{ 0x17, 6 },{ 0x9, 5 },{ 0x5, 5 },
-{ 0x6, 5 },{ 0x47, 9 },{ 0x20, 7 },{ 0x10, 7 },
-{ 0x2, 5 },{ 0x7c, 9 },{ 0x3a, 7 },{ 0x1d, 7 },
-{ 0x2, 6 },{ 0xec, 9 },{ 0x77, 8 },{ 0x0, 8 },
-{ 0x3, 5 },{ 0xb7, 9 },{ 0x2c, 7 },{ 0x13, 7 },
-{ 0x1, 6 },{ 0x168, 10 },{ 0x46, 8 },{ 0x3f, 8 },
-{ 0x1e, 6 },{ 0x712, 13 },{ 0xb5, 9 },{ 0x42, 8 },
-{ 0x22, 7 },{ 0x1c5, 11 },{ 0x11e, 10 },{ 0x87, 9 },
-{ 0x6, 4 },{ 0x3, 9 },{ 0x1e, 7 },{ 0x1c, 6 },
-{ 0x12, 7 },{ 0x388, 12 },{ 0x44, 9 },{ 0x70, 9 },
-{ 0x1f, 6 },{ 0x23e, 11 },{ 0x39, 8 },{ 0x8e, 9 },
-{ 0x1, 7 },{ 0x1c6, 11 },{ 0xb6, 9 },{ 0x45, 9 },
-{ 0x14, 6 },{ 0x23f, 11 },{ 0x7d, 9 },{ 0x18, 9 },
-{ 0x7, 7 },{ 0x1c7, 11 },{ 0x86, 9 },{ 0x19, 9 },
-{ 0x15, 6 },{ 0x1db, 10 },{ 0x2, 9 },{ 0x46, 9 },
-{ 0xd, 8 },{ 0x713, 13 },{ 0x1da, 10 },{ 0x169, 10 },
+{ 0x1, 1 },{ 0x17, 6 },{ 0x9, 5 },{ 0x5, 5 },
+{ 0x6, 5 },{ 0x47, 9 },{ 0x20, 7 },{ 0x10, 7 },
+{ 0x2, 5 },{ 0x7c, 9 },{ 0x3a, 7 },{ 0x1d, 7 },
+{ 0x2, 6 },{ 0xec, 9 },{ 0x77, 8 },{ 0x0, 8 },
+{ 0x3, 5 },{ 0xb7, 9 },{ 0x2c, 7 },{ 0x13, 7 },
+{ 0x1, 6 },{ 0x168, 10 },{ 0x46, 8 },{ 0x3f, 8 },
+{ 0x1e, 6 },{ 0x712, 13 },{ 0xb5, 9 },{ 0x42, 8 },
+{ 0x22, 7 },{ 0x1c5, 11 },{ 0x11e, 10 },{ 0x87, 9 },
+{ 0x6, 4 },{ 0x3, 9 },{ 0x1e, 7 },{ 0x1c, 6 },
+{ 0x12, 7 },{ 0x388, 12 },{ 0x44, 9 },{ 0x70, 9 },
+{ 0x1f, 6 },{ 0x23e, 11 },{ 0x39, 8 },{ 0x8e, 9 },
+{ 0x1, 7 },{ 0x1c6, 11 },{ 0xb6, 9 },{ 0x45, 9 },
+{ 0x14, 6 },{ 0x23f, 11 },{ 0x7d, 9 },{ 0x18, 9 },
+{ 0x7, 7 },{ 0x1c7, 11 },{ 0x86, 9 },{ 0x19, 9 },
+{ 0x15, 6 },{ 0x1db, 10 },{ 0x2, 9 },{ 0x46, 9 },
+{ 0xd, 8 },{ 0x713, 13 },{ 0x1da, 10 },{ 0x169, 10 },
};
/* non intra picture macroblock coded block pattern + mb type */
diff --git a/libavcodec/msrledec.c b/libavcodec/msrledec.c
index f44b8b6..7d09ed9 100644
--- a/libavcodec/msrledec.c
+++ b/libavcodec/msrledec.c
@@ -145,8 +145,10 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVPicture *pic, int de
p2 = *src++;
if(p2 == 0) { //End-of-line
output = pic->data[0] + (--line) * pic->linesize[0];
- if (line < 0)
+ if (line < 0){
+ av_log(avctx, AV_LOG_ERROR, "Next line is beyond picture bounds\n");
return -1;
+ }
pos = 0;
continue;
} else if(p2 == 1) { //End-of-picture
@@ -155,8 +157,10 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVPicture *pic, int de
p1 = *src++;
p2 = *src++;
line -= p2;
- if (line < 0)
+ if (line < 0){
+ av_log(avctx, AV_LOG_ERROR, "Skip beyond picture bounds\n");
return -1;
+ }
pos += p1;
output = pic->data[0] + line * pic->linesize[0] + pos * (depth >> 3);
continue;
@@ -191,13 +195,12 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVPicture *pic, int de
}
pos += p2;
} else { //Run of pixels
- int pix[4]; //original pixel
+ uint8_t pix[3]; //original pixel
switch(depth){
case 8: pix[0] = *src++;
break;
case 16: pix16 = AV_RL16(src);
src += 2;
- *(uint16_t*)pix = pix16;
break;
case 24: pix[0] = *src++;
pix[1] = *src++;
@@ -205,7 +208,6 @@ static int msrle_decode_8_16_24_32(AVCodecContext *avctx, AVPicture *pic, int de
break;
case 32: pix32 = AV_RL32(src);
src += 4;
- *(uint32_t*)pix = pix32;
break;
}
if (output + p1 * (depth >> 3) > output_end)
diff --git a/libavcodec/nellymoserdec.c b/libavcodec/nellymoserdec.c
index f2a413e..dfbb41c 100644
--- a/libavcodec/nellymoserdec.c
+++ b/libavcodec/nellymoserdec.c
@@ -148,6 +148,7 @@ static av_cold int decode_init(AVCodecContext * avctx) {
ff_sine_window_init(sine_window, 128);
avctx->sample_fmt = SAMPLE_FMT_S16;
+ avctx->channel_layout = CH_LAYOUT_MONO;
return 0;
}
diff --git a/libavcodec/nellymoserenc.c b/libavcodec/nellymoserenc.c
index 03db30c..13fe64d 100644
--- a/libavcodec/nellymoserenc.c
+++ b/libavcodec/nellymoserenc.c
@@ -28,7 +28,7 @@
*
* Generic codec information: libavcodec/nellymoserdec.c
*
- * Some information also from: http://www1.mplayerhq.hu/ASAO/ASAO.zip
+ * Some information also from: http://samples.mplayerhq.hu/A-codecs/Nelly_Moser/ASAO/ASAO.zip
* (Copyright Joseph Artsimovich and UAB "DKD")
*
* for more information about nellymoser format, visit:
diff --git a/libavcodec/opt.c b/libavcodec/opt.c
index acf94ba..78fbfae 100644
--- a/libavcodec/opt.c
+++ b/libavcodec/opt.c
@@ -47,15 +47,17 @@ const AVOption *av_next_option(void *obj, const AVOption *last){
else return (*(AVClass**)obj)->option;
}
-static const AVOption *av_set_number(void *obj, const char *name, double num, int den, int64_t intnum){
+static int av_set_number2(void *obj, const char *name, double num, int den, int64_t intnum, const AVOption **o_out){
const AVOption *o= av_find_opt(obj, name, NULL, 0, 0);
void *dst;
+ if(o_out)
+ *o_out= o;
if(!o || o->offset<=0)
- return NULL;
+ return AVERROR(ENOENT);
if(o->max*den < num*intnum || o->min*den > num*intnum) {
- av_log(NULL, AV_LOG_ERROR, "Value %lf for parameter '%s' out of range.\n", num, name);
- return NULL;
+ av_log(NULL, AV_LOG_ERROR, "Value %lf for parameter '%s' out of range\n", num, name);
+ return AVERROR(ERANGE);
}
dst= ((uint8_t*)obj) + o->offset;
@@ -71,27 +73,17 @@ static const AVOption *av_set_number(void *obj, const char *name, double num, in
else *(AVRational*)dst= av_d2q(num*intnum/den, 1<<24);
break;
default:
- return NULL;
+ return AVERROR(EINVAL);
}
- return o;
+ return 0;
}
-static const AVOption *set_all_opt(void *v, const char *unit, double d){
- AVClass *c= *(AVClass**)v; //FIXME silly way of storing AVClass
- const AVOption *o= c->option;
- const AVOption *ret=NULL;
-
- for(;o && o->name; o++){
- if(o->type != FF_OPT_TYPE_CONST && o->unit && !strcmp(o->unit, unit)){
- double tmp= d;
- if(o->type == FF_OPT_TYPE_FLAGS)
- tmp= av_get_int(v, o->name, NULL) | (int64_t)d;
-
- av_set_number(v, o->name, tmp, 1, 1);
- ret= o;
- }
- }
- return ret;
+static const AVOption *av_set_number(void *obj, const char *name, double num, int den, int64_t intnum){
+ const AVOption *o = NULL;
+ if (av_set_number2(obj, name, num, den, intnum, &o) < 0)
+ return NULL;
+ else
+ return o;
}
static const double const_values[]={
@@ -115,13 +107,16 @@ static int hexchar2int(char c) {
return -1;
}
-const AVOption *av_set_string2(void *obj, const char *name, const char *val, int alloc){
+int av_set_string3(void *obj, const char *name, const char *val, int alloc, const AVOption **o_out){
+ int ret;
const AVOption *o= av_find_opt(obj, name, NULL, 0, 0);
- if(o && o->offset==0 && o->type == FF_OPT_TYPE_CONST && o->unit){
- return set_all_opt(obj, o->unit, o->default_val);
- }
- if(!o || !val || o->offset<=0)
- return NULL;
+ if (o_out)
+ *o_out = o;
+ if(!o)
+ return AVERROR(ENOENT);
+ if(!val || o->offset<=0)
+ return AVERROR(EINVAL);
+
if(o->type == FF_OPT_TYPE_BINARY){
uint8_t **dst = (uint8_t **)(((uint8_t*)obj) + o->offset);
int *lendst = (int *)(dst + 1);
@@ -129,7 +124,7 @@ const AVOption *av_set_string2(void *obj, const char *name, const char *val, int
int len = strlen(val);
av_freep(dst);
*lendst = 0;
- if (len & 1) return NULL;
+ if (len & 1) return AVERROR(EINVAL);
len /= 2;
ptr = bin = av_malloc(len);
while (*val) {
@@ -137,13 +132,13 @@ const AVOption *av_set_string2(void *obj, const char *name, const char *val, int
int b = hexchar2int(*val++);
if (a < 0 || b < 0) {
av_free(bin);
- return NULL;
+ return AVERROR(EINVAL);
}
*ptr++ = (a << 4) | b;
}
*dst = bin;
*lendst = len;
- return o;
+ return 0;
}
if(o->type != FF_OPT_TYPE_STRING){
int notfirst=0;
@@ -174,7 +169,7 @@ const AVOption *av_set_string2(void *obj, const char *name, const char *val, int
else {
if (error)
av_log(NULL, AV_LOG_ERROR, "Unable to parse option value \"%s\": %s\n", val, error);
- return NULL;
+ return AVERROR(EINVAL);
}
}
if(o->type == FF_OPT_TYPE_FLAGS){
@@ -185,14 +180,14 @@ const AVOption *av_set_string2(void *obj, const char *name, const char *val, int
else if(cmd=='-') d= notfirst*av_get_double(obj, name, NULL) - d;
}
- if (!av_set_number(obj, name, d, 1, 1))
- return NULL;
+ if ((ret = av_set_number2(obj, name, d, 1, 1, o_out)) < 0)
+ return ret;
val+= i;
if(!*val)
- return o;
+ return 0;
notfirst=1;
}
- return NULL;
+ return AVERROR(EINVAL);
}
if(alloc){
@@ -201,12 +196,24 @@ const AVOption *av_set_string2(void *obj, const char *name, const char *val, int
}
memcpy(((uint8_t*)obj) + o->offset, &val, sizeof(val));
+ return 0;
+}
+
+#if LIBAVCODEC_VERSION_MAJOR < 53
+const AVOption *av_set_string2(void *obj, const char *name, const char *val, int alloc){
+ const AVOption *o;
+ if (av_set_string3(obj, name, val, alloc, &o) < 0)
+ return NULL;
return o;
}
const AVOption *av_set_string(void *obj, const char *name, const char *val){
- return av_set_string2(obj, name, val, 0);
+ const AVOption *o;
+ if (av_set_string3(obj, name, val, 0, &o) < 0)
+ return NULL;
+ return o;
}
+#endif
const AVOption *av_set_double(void *obj, const char *name, double n){
return av_set_number(obj, name, n, 1, 1);
@@ -415,6 +422,11 @@ void av_opt_set_defaults2(void *s, int mask, int flags)
av_set_int(s, opt->name, val);
}
break;
+ case FF_OPT_TYPE_INT64:
+ if((double)(opt->default_val+0.6) == opt->default_val)
+ av_log(s, AV_LOG_DEBUG, "loss of precission in default of %s\n", opt->name);
+ av_set_int(s, opt->name, opt->default_val);
+ break;
case FF_OPT_TYPE_FLOAT: {
double val;
val = opt->default_val;
diff --git a/libavcodec/opt.h b/libavcodec/opt.h
index 557c430..c968930 100644
--- a/libavcodec/opt.h
+++ b/libavcodec/opt.h
@@ -91,7 +91,7 @@ typedef struct AVOption {
* for which it is the case that opt->flags & mask == flags).
*
* @param[in] obj a pointer to a struct whose first element is a
- * pointer to an #AVClass
+ * pointer to an AVClass
* @param[in] name the name of the option to look for
* @param[in] unit the unit of the option to look for, or any if NULL
* @return a pointer to the option found, or NULL if no option
@@ -99,12 +99,22 @@ typedef struct AVOption {
*/
const AVOption *av_find_opt(void *obj, const char *name, const char *unit, int mask, int flags);
+#if LIBAVCODEC_VERSION_MAJOR < 53
/**
* @see av_set_string2()
*/
attribute_deprecated const AVOption *av_set_string(void *obj, const char *name, const char *val);
/**
+ * @return a pointer to the AVOption corresponding to the field set or
+ * NULL if no matching AVOption exists, or if the value \p val is not
+ * valid
+ * @see av_set_string3()
+ */
+attribute_deprecated const AVOption *av_set_string2(void *obj, const char *name, const char *val, int alloc);
+#endif
+
+/**
* Sets the field of obj with the given name to value.
*
* @param[in] obj A struct whose first element is a pointer to an
@@ -120,14 +130,15 @@ attribute_deprecated const AVOption *av_set_string(void *obj, const char *name,
* scalars or named flags separated by '+' or '-'. Prefixing a flag
* with '+' causes it to be set without affecting the other flags;
* similarly, '-' unsets a flag.
- * @return a pointer to the AVOption corresponding to the field set or
- * NULL if no matching AVOption exists, or if the value \p val is not
- * valid
+ * @param[out] o_out if non-NULL put here a pointer to the AVOption
+ * found
* @param alloc when 1 then the old value will be av_freed() and the
* new av_strduped()
* when 0 then no av_free() nor av_strdup() will be used
+ * @return 0 if the value has been set, an AVERROR* error code if no
+ * matching option exists, or if the value \p val is not valid
*/
-const AVOption *av_set_string2(void *obj, const char *name, const char *val, int alloc);
+int av_set_string3(void *obj, const char *name, const char *val, int alloc, const AVOption **o_out);
const AVOption *av_set_double(void *obj, const char *name, double n);
const AVOption *av_set_q(void *obj, const char *name, AVRational n);
diff --git a/libavcodec/png.h b/libavcodec/png.h
index 0fde3f4..17b1466 100644
--- a/libavcodec/png.h
+++ b/libavcodec/png.h
@@ -64,14 +64,13 @@ extern const uint8_t ff_png_pass_xshift[NB_PASSES];
/* Mask to determine which pixels are valid in a pass */
extern const uint8_t ff_png_pass_mask[NB_PASSES];
-extern void *ff_png_zalloc(void *opaque, unsigned int items,
- unsigned int size);
+void *ff_png_zalloc(void *opaque, unsigned int items, unsigned int size);
-extern void ff_png_zfree(void *opaque, void *ptr);
+void ff_png_zfree(void *opaque, void *ptr);
-extern int ff_png_get_nb_channels(int color_type);
+int ff_png_get_nb_channels(int color_type);
/* compute the row size of an interleaved pass */
-extern int ff_png_pass_row_size(int pass, int bits_per_pixel, int width);
+int ff_png_pass_row_size(int pass, int bits_per_pixel, int width);
#endif /* AVCODEC_PNG_H */
diff --git a/libavcodec/ppc/dsputil_altivec.c b/libavcodec/ppc/dsputil_altivec.c
index 6ff219e..44cce6a 100644
--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@@ -26,6 +26,7 @@
#include "dsputil_ppc.h"
#include "util_altivec.h"
+#include "types_altivec.h"
int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
@@ -573,6 +574,20 @@ void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
}
}
+
+static void clear_block_altivec(DCTELEM *block) {
+ LOAD_ZERO;
+ vec_st(zero_s16v, 0, block);
+ vec_st(zero_s16v, 16, block);
+ vec_st(zero_s16v, 32, block);
+ vec_st(zero_s16v, 48, block);
+ vec_st(zero_s16v, 64, block);
+ vec_st(zero_s16v, 80, block);
+ vec_st(zero_s16v, 96, block);
+ vec_st(zero_s16v, 112, block);
+}
+
+
void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
register int i;
register vector unsigned char vdst, vsrc;
@@ -1420,6 +1435,7 @@ void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
c->pix_sum = pix_sum_altivec;
c->diff_pixels = diff_pixels_altivec;
c->get_pixels = get_pixels_altivec;
+ c->clear_block = clear_block_altivec;
c->add_bytes= add_bytes_altivec;
c->put_pixels_tab[0][0] = put_pixels16_altivec;
/* the two functions do the same thing, so use the same code */
diff --git a/libavcodec/ppc/dsputil_altivec.h b/libavcodec/ppc/dsputil_altivec.h
index 63f817a..03ff0b9 100644
--- a/libavcodec/ppc/dsputil_altivec.h
+++ b/libavcodec/ppc/dsputil_altivec.h
@@ -25,7 +25,7 @@
#include <stdint.h>
-extern int has_altivec(void);
+int has_altivec(void);
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
diff --git a/libavcodec/ppc/dsputil_ppc.c b/libavcodec/ppc/dsputil_ppc.c
index 443e1db..124453e 100644
--- a/libavcodec/ppc/dsputil_ppc.c
+++ b/libavcodec/ppc/dsputil_ppc.c
@@ -27,11 +27,11 @@
#ifdef HAVE_ALTIVEC
#include "dsputil_altivec.h"
-extern void fdct_altivec(int16_t *block);
-extern void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h,
- int x16, int y16, int rounder);
-extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
-extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
+void fdct_altivec(int16_t *block);
+void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h,
+ int x16, int y16, int rounder);
+void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
+void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx);
diff --git a/libavcodec/ppc/fft_altivec.c b/libavcodec/ppc/fft_altivec.c
index ddf142b..19123d0 100644
--- a/libavcodec/ppc/fft_altivec.c
+++ b/libavcodec/ppc/fft_altivec.c
@@ -26,31 +26,6 @@
#include "dsputil_ppc.h"
#include "util_altivec.h"
-/*
- those three macros are from libavcodec/fft.c
- and are required for the reference C code
-*/
-/* butter fly op */
-#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
-{\
- FFTSample ax, ay, bx, by;\
- bx=pre1;\
- by=pim1;\
- ax=qre1;\
- ay=qim1;\
- pre = (bx + ax);\
- pim = (by + ay);\
- qre = (bx - ax);\
- qim = (by - ay);\
-}
-#define MUL16(a,b) ((a) * (b))
-#define CMUL(pre, pim, are, aim, bre, bim) \
-{\
- pre = (MUL16(are, bre) - MUL16(aim, bim));\
- pim = (MUL16(are, bim) + MUL16(bre, aim));\
-}
-
-
/**
* Do a complex FFT with the parameters defined in ff_fft_init(). The
* input data must be permuted before with s->revtab table. No
diff --git a/libavcodec/ppc/float_altivec.c b/libavcodec/ppc/float_altivec.c
index 1f39d8f..fd4aa53 100644
--- a/libavcodec/ppc/float_altivec.c
+++ b/libavcodec/ppc/float_altivec.c
@@ -23,6 +23,7 @@
#include "gcc_fixes.h"
#include "dsputil_altivec.h"
+#include "util_altivec.h"
static void vector_fmul_altivec(float *dst, const float *src, int len)
{
@@ -149,6 +150,67 @@ static void vector_fmul_add_add_altivec(float *dst, const float *src0,
ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}
+static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len)
+{
+ union {
+ vector float v;
+ float s[4];
+ } vadd;
+ vector float vadd_bias, zero, t0, t1, s0, s1, wi, wj;
+ const vector unsigned char reverse = vcprm(3,2,1,0);
+ int i,j;
+
+ dst += len;
+ win += len;
+ src0+= len;
+
+ vadd.s[0] = add_bias;
+ vadd_bias = vec_splat(vadd.v, 0);
+ zero = (vector float)vec_splat_u32(0);
+
+ for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) {
+ s0 = vec_ld(i, src0);
+ s1 = vec_ld(j, src1);
+ wi = vec_ld(i, win);
+ wj = vec_ld(j, win);
+
+ s1 = vec_perm(s1, s1, reverse);
+ wj = vec_perm(wj, wj, reverse);
+
+ t0 = vec_madd(s0, wj, vadd_bias);
+ t0 = vec_nmsub(s1, wi, t0);
+ t1 = vec_madd(s0, wi, vadd_bias);
+ t1 = vec_madd(s1, wj, t1);
+ t1 = vec_perm(t1, t1, reverse);
+
+ vec_st(t0, i, dst);
+ vec_st(t1, j, dst);
+ }
+}
+
+static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
+{
+ union {
+ vector float v;
+ float s[4];
+ } mul_u;
+ int i;
+ vector float src1, src2, dst1, dst2, mul_v, zero;
+
+ zero = (vector float)vec_splat_u32(0);
+ mul_u.s[0] = mul;
+ mul_v = vec_splat(mul_u.v, 0);
+
+ for(i=0; i<len; i+=8) {
+ src1 = vec_ctf(vec_ld(0, src+i), 0);
+ src2 = vec_ctf(vec_ld(16, src+i), 0);
+ dst1 = vec_madd(src1, mul_v, zero);
+ dst2 = vec_madd(src2, mul_v, zero);
+ vec_st(dst1, 0, dst+i);
+ vec_st(dst2, 16, dst+i);
+ }
+}
+
static vector signed short
float_to_int16_one_altivec(const float *src)
@@ -160,7 +222,7 @@ float_to_int16_one_altivec(const float *src)
return vec_packs(t0,t1);
}
-static void float_to_int16_altivec(int16_t *dst, const float *src, int len)
+static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
{
int i;
vector signed short d0, d1, d;
@@ -240,7 +302,9 @@ void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
c->vector_fmul = vector_fmul_altivec;
c->vector_fmul_reverse = vector_fmul_reverse_altivec;
c->vector_fmul_add_add = vector_fmul_add_add_altivec;
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
+ c->vector_fmul_window = vector_fmul_window_altivec;
c->float_to_int16 = float_to_int16_altivec;
c->float_to_int16_interleave = float_to_int16_interleave_altivec;
}
diff --git a/libavcodec/ppc/h264_altivec.c b/libavcodec/ppc/h264_altivec.c
index ab4c5c1..b589be2 100644
--- a/libavcodec/ppc/h264_altivec.c
+++ b/libavcodec/ppc/h264_altivec.c
@@ -19,6 +19,7 @@
*/
#include "libavcodec/dsputil.h"
+#include "libavcodec/h264data.h"
#include "gcc_fixes.h"
@@ -188,32 +189,32 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
((8 - x) * (y)),
((x) * (y))};
register int i;
- vec_u8_t fperm;
- const vec_s32_t vABCD = vec_ld(0, ABCD);
- const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
- const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
- const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
- const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
+ vec_u8 fperm;
+ const vec_s32 vABCD = vec_ld(0, ABCD);
+ const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
+ const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
+ const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
+ const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
LOAD_ZERO;
- const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
- const vec_u16_t v6us = vec_splat_u16(6);
+ const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
+ const vec_u16 v6us = vec_splat_u16(6);
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
- vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
- vec_u8_t vsrc0uc, vsrc1uc;
- vec_s16_t vsrc0ssH, vsrc1ssH;
- vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
- vec_s16_t vsrc2ssH, vsrc3ssH, psum;
- vec_u8_t vdst, ppsum, fsum;
+ vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
+ vec_u8 vsrc0uc, vsrc1uc;
+ vec_s16 vsrc0ssH, vsrc1ssH;
+ vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
+ vec_s16 vsrc2ssH, vsrc3ssH, psum;
+ vec_u8 vdst, ppsum, fsum;
if (((unsigned long)dst) % 16 == 0) {
- fperm = (vec_u8_t){0x10, 0x11, 0x12, 0x13,
+ fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F};
} else {
- fperm = (vec_u8_t){0x00, 0x01, 0x02, 0x03,
+ fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F};
@@ -232,8 +233,8 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
else
vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
- vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc);
- vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc);
+ vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
+ vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) {
@@ -244,8 +245,8 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
- vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
- vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
+ vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc);
+ vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc);
psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
psum = vec_mladd(vB, vsrc1ssH, psum);
@@ -255,7 +256,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
psum = vec_sra(psum, v6us);
vdst = vec_ld(0, dst);
- ppsum = (vec_u8_t)vec_packsu(psum, psum);
+ ppsum = (vec_u8)vec_packsu(psum, psum);
fsum = vec_perm(vdst, ppsum, fperm);
vec_st(fsum, 0, dst);
@@ -267,7 +268,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
src += stride;
}
} else {
- vec_u8_t vsrcDuc;
+ vec_u8 vsrcDuc;
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 16, src);
@@ -278,8 +279,8 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
else
vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
- vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
- vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
+ vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc);
+ vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc);
psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
psum = vec_mladd(vB, vsrc1ssH, psum);
@@ -289,7 +290,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
psum = vec_sr(psum, v6us);
vdst = vec_ld(0, dst);
- ppsum = (vec_u8_t)vec_pack(psum, psum);
+ ppsum = (vec_u8)vec_pack(psum, psum);
fsum = vec_perm(vdst, ppsum, fperm);
vec_st(fsum, 0, dst);
@@ -308,7 +309,7 @@ static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
int src_stride1, int h)
{
int i;
- vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+ vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
mask_ = vec_lvsl(0, src2);
@@ -350,7 +351,7 @@ static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
int src_stride1, int h)
{
int i;
- vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+ vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
mask_ = vec_lvsl(0, src2);
@@ -431,23 +432,23 @@ H264_MC(avg_, 16, altivec)
#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \
vdst_orig = vec_ld(0, dst); \
vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
- vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst); \
+ vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst); \
va = vec_add(va, vdst_ss); \
va_u8 = vec_packsu(va, zero_s16v); \
- va_u32 = vec_splat((vec_u32_t)va_u8, 0); \
+ va_u32 = vec_splat((vec_u32)va_u8, 0); \
vec_ste(va_u32, element, (uint32_t*)dst);
static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
{
- vec_s16_t va0, va1, va2, va3;
- vec_s16_t vz0, vz1, vz2, vz3;
- vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3;
- vec_u8_t va_u8;
- vec_u32_t va_u32;
- vec_s16_t vdst_ss;
- const vec_u16_t v6us = vec_splat_u16(6);
- vec_u8_t vdst, vdst_orig;
- vec_u8_t vdst_mask = vec_lvsl(0, dst);
+ vec_s16 va0, va1, va2, va3;
+ vec_s16 vz0, vz1, vz2, vz3;
+ vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
+ vec_u8 va_u8;
+ vec_u32 va_u32;
+ vec_s16 vdst_ss;
+ const vec_u16 v6us = vec_splat_u16(6);
+ vec_u8 vdst, vdst_orig;
+ vec_u8 vdst_mask = vec_lvsl(0, dst);
int element = ((unsigned long)dst & 0xf) >> 2;
LOAD_ZERO;
@@ -478,40 +479,40 @@ static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\
/* a0 = SRC(0) + SRC(4); */ \
- vec_s16_t a0v = vec_add(s0, s4); \
+ vec_s16 a0v = vec_add(s0, s4); \
/* a2 = SRC(0) - SRC(4); */ \
- vec_s16_t a2v = vec_sub(s0, s4); \
+ vec_s16 a2v = vec_sub(s0, s4); \
/* a4 = (SRC(2)>>1) - SRC(6); */ \
- vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6); \
+ vec_s16 a4v = vec_sub(vec_sra(s2, onev), s6); \
/* a6 = (SRC(6)>>1) + SRC(2); */ \
- vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2); \
+ vec_s16 a6v = vec_add(vec_sra(s6, onev), s2); \
/* b0 = a0 + a6; */ \
- vec_s16_t b0v = vec_add(a0v, a6v); \
+ vec_s16 b0v = vec_add(a0v, a6v); \
/* b2 = a2 + a4; */ \
- vec_s16_t b2v = vec_add(a2v, a4v); \
+ vec_s16 b2v = vec_add(a2v, a4v); \
/* b4 = a2 - a4; */ \
- vec_s16_t b4v = vec_sub(a2v, a4v); \
+ vec_s16 b4v = vec_sub(a2v, a4v); \
/* b6 = a0 - a6; */ \
- vec_s16_t b6v = vec_sub(a0v, a6v); \
+ vec_s16 b6v = vec_sub(a0v, a6v); \
/* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
/* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \
- vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
+ vec_s16 a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
/* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
/* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \
- vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
+ vec_s16 a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
/* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
/* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \
- vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
+ vec_s16 a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
/* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \
- vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
+ vec_s16 a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
/* b1 = (a7>>2) + a1; */ \
- vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \
+ vec_s16 b1v = vec_add( vec_sra(a7v, twov), a1v); \
/* b3 = a3 + (a5>>2); */ \
- vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \
+ vec_s16 b3v = vec_add(a3v, vec_sra(a5v, twov)); \
/* b5 = (a3>>2) - a5; */ \
- vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \
+ vec_s16 b5v = vec_sub( vec_sra(a3v, twov), a5v); \
/* b7 = a7 - (a1>>2); */ \
- vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
+ vec_s16 b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
/* DST(0, b0 + b7); */ \
d0 = vec_add(b0v, b7v); \
/* DST(1, b2 + b5); */ \
@@ -532,17 +533,17 @@ static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
/* unaligned load */ \
- vec_u8_t hv = vec_ld( 0, dest ); \
- vec_u8_t lv = vec_ld( 7, dest ); \
- vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \
- vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \
- vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \
- vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \
- vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \
- vec_u8_t edgehv; \
+ vec_u8 hv = vec_ld( 0, dest ); \
+ vec_u8 lv = vec_ld( 7, dest ); \
+ vec_u8 dstv = vec_perm( hv, lv, (vec_u8)perm_ldv ); \
+ vec_s16 idct_sh6 = vec_sra(idctv, sixv); \
+ vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv); \
+ vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16); \
+ vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum); \
+ vec_u8 edgehv; \
/* unaligned store */ \
- vec_u8_t bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\
- vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv ); \
+ vec_u8 bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\
+ vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv ); \
lv = vec_sel( lv, bodyv, edgelv ); \
vec_st( lv, 7, dest ); \
hv = vec_ld( 0, dest ); \
@@ -552,18 +553,18 @@ static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
}
void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
- vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;
- vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
- vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
+ vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;
+ vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
+ vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
- vec_u8_t perm_ldv = vec_lvsl(0, dst);
- vec_u8_t perm_stv = vec_lvsr(8, dst);
+ vec_u8 perm_ldv = vec_lvsl(0, dst);
+ vec_u8 perm_stv = vec_lvsr(8, dst);
- const vec_u16_t onev = vec_splat_u16(1);
- const vec_u16_t twov = vec_splat_u16(2);
- const vec_u16_t sixv = vec_splat_u16(6);
+ const vec_u16 onev = vec_splat_u16(1);
+ const vec_u16 twov = vec_splat_u16(2);
+ const vec_u16 sixv = vec_splat_u16(6);
- const vec_u8_t sel = (vec_u8_t) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1};
+ const vec_u8 sel = (vec_u8) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1};
LOAD_ZERO;
dct[0] += 32; // rounding for the >>6 at the end
@@ -595,11 +596,106 @@ void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
}
+static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, DCTELEM *block, int stride, int size)
+{
+ vec_s16 dc16;
+ vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
+ LOAD_ZERO;
+ DECLARE_ALIGNED_16(int, dc);
+ int i;
+
+ dc = (block[0] + 32) >> 6;
+ dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);
+
+ if (size == 4)
+ dc16 = vec_sld(dc16, zero_s16v, 8);
+ dcplus = vec_packsu(dc16, zero_s16v);
+ dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);
+
+ aligner = vec_lvsr(0, dst);
+ dcplus = vec_perm(dcplus, dcplus, aligner);
+ dcminus = vec_perm(dcminus, dcminus, aligner);
+
+ for (i = 0; i < size; i += 4) {
+ v0 = vec_ld(0, dst+0*stride);
+ v1 = vec_ld(0, dst+1*stride);
+ v2 = vec_ld(0, dst+2*stride);
+ v3 = vec_ld(0, dst+3*stride);
+
+ v0 = vec_adds(v0, dcplus);
+ v1 = vec_adds(v1, dcplus);
+ v2 = vec_adds(v2, dcplus);
+ v3 = vec_adds(v3, dcplus);
+
+ v0 = vec_subs(v0, dcminus);
+ v1 = vec_subs(v1, dcminus);
+ v2 = vec_subs(v2, dcminus);
+ v3 = vec_subs(v3, dcminus);
+
+ vec_st(v0, 0, dst+0*stride);
+ vec_st(v1, 0, dst+1*stride);
+ vec_st(v2, 0, dst+2*stride);
+ vec_st(v3, 0, dst+3*stride);
+
+ dst += 4*stride;
+ }
+}
+
+static void h264_idct_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
+{
+ h264_idct_dc_add_internal(dst, block, stride, 4);
+}
+
+static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
+{
+ h264_idct_dc_add_internal(dst, block, stride, 8);
+}
+
+static void ff_h264_idct_add16_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ int nnz = nnzc[ scan8[i] ];
+ if(nnz){
+ if(nnz==1 && block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride);
+ else ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride);
+ }
+ }
+}
+
+static void ff_h264_idct_add16intra_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ if(nnzc[ scan8[i] ]) ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride);
+ else if(block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i+=4){
+ int nnz = nnzc[ scan8[i] ];
+ if(nnz){
+ if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_altivec(dst + block_offset[i], block + i*16, stride);
+ else ff_h264_idct8_add_altivec (dst + block_offset[i], block + i*16, stride);
+ }
+ }
+}
+
+static void ff_h264_idct_add8_altivec(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=16; i<16+8; i++){
+ if(nnzc[ scan8[i] ])
+ ff_h264_idct_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ else if(block[i*16])
+ h264_idct_dc_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ }
+}
+
#define transpose4x16(r0, r1, r2, r3) { \
- register vec_u8_t r4; \
- register vec_u8_t r5; \
- register vec_u8_t r6; \
- register vec_u8_t r7; \
+ register vec_u8 r4; \
+ register vec_u8 r5; \
+ register vec_u8 r6; \
+ register vec_u8 r7; \
\
r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \
r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \
@@ -613,8 +709,8 @@ void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
}
static inline void write16x4(uint8_t *dst, int dst_stride,
- register vec_u8_t r0, register vec_u8_t r1,
- register vec_u8_t r2, register vec_u8_t r3) {
+ register vec_u8 r0, register vec_u8 r1,
+ register vec_u8 r2, register vec_u8 r3) {
DECLARE_ALIGNED_16(unsigned char, result[64]);
uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
int int_dst_stride = dst_stride/4;
@@ -646,16 +742,16 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
\todo FIXME: see if we can't spare some vec_lvsl() by them factorizing
out of unaligned_load() */
#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
- register vec_u8_t r0 = unaligned_load(0, src); \
- register vec_u8_t r1 = unaligned_load( src_stride, src); \
- register vec_u8_t r2 = unaligned_load(2* src_stride, src); \
- register vec_u8_t r3 = unaligned_load(3* src_stride, src); \
- register vec_u8_t r4 = unaligned_load(4* src_stride, src); \
- register vec_u8_t r5 = unaligned_load(5* src_stride, src); \
- register vec_u8_t r6 = unaligned_load(6* src_stride, src); \
- register vec_u8_t r7 = unaligned_load(7* src_stride, src); \
- register vec_u8_t r14 = unaligned_load(14*src_stride, src); \
- register vec_u8_t r15 = unaligned_load(15*src_stride, src); \
+ register vec_u8 r0 = unaligned_load(0, src); \
+ register vec_u8 r1 = unaligned_load( src_stride, src); \
+ register vec_u8 r2 = unaligned_load(2* src_stride, src); \
+ register vec_u8 r3 = unaligned_load(3* src_stride, src); \
+ register vec_u8 r4 = unaligned_load(4* src_stride, src); \
+ register vec_u8 r5 = unaligned_load(5* src_stride, src); \
+ register vec_u8 r6 = unaligned_load(6* src_stride, src); \
+ register vec_u8 r7 = unaligned_load(7* src_stride, src); \
+ register vec_u8 r14 = unaligned_load(14*src_stride, src); \
+ register vec_u8 r15 = unaligned_load(15*src_stride, src); \
\
r8 = unaligned_load( 8*src_stride, src); \
r9 = unaligned_load( 9*src_stride, src); \
@@ -705,26 +801,26 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
}
// out: o = |x-y| < a
-static inline vec_u8_t diff_lt_altivec ( register vec_u8_t x,
- register vec_u8_t y,
- register vec_u8_t a) {
-
- register vec_u8_t diff = vec_subs(x, y);
- register vec_u8_t diffneg = vec_subs(y, x);
- register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
- o = (vec_u8_t)vec_cmplt(o, a);
+static inline vec_u8 diff_lt_altivec ( register vec_u8 x,
+ register vec_u8 y,
+ register vec_u8 a) {
+
+ register vec_u8 diff = vec_subs(x, y);
+ register vec_u8 diffneg = vec_subs(y, x);
+ register vec_u8 o = vec_or(diff, diffneg); /* |x-y| */
+ o = (vec_u8)vec_cmplt(o, a);
return o;
}
-static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0,
- register vec_u8_t p1,
- register vec_u8_t q0,
- register vec_u8_t q1,
- register vec_u8_t alpha,
- register vec_u8_t beta) {
+static inline vec_u8 h264_deblock_mask ( register vec_u8 p0,
+ register vec_u8 p1,
+ register vec_u8 q0,
+ register vec_u8 q1,
+ register vec_u8 alpha,
+ register vec_u8 beta) {
- register vec_u8_t mask;
- register vec_u8_t tempmask;
+ register vec_u8 mask;
+ register vec_u8 tempmask;
mask = diff_lt_altivec(p0, q0, alpha);
tempmask = diff_lt_altivec(p1, p0, beta);
@@ -736,19 +832,19 @@ static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0,
}
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
-static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
- register vec_u8_t p1,
- register vec_u8_t p2,
- register vec_u8_t q0,
- register vec_u8_t tc0) {
-
- register vec_u8_t average = vec_avg(p0, q0);
- register vec_u8_t temp;
- register vec_u8_t uncliped;
- register vec_u8_t ones;
- register vec_u8_t max;
- register vec_u8_t min;
- register vec_u8_t newp1;
+static inline vec_u8 h264_deblock_q1(register vec_u8 p0,
+ register vec_u8 p1,
+ register vec_u8 p2,
+ register vec_u8 q0,
+ register vec_u8 tc0) {
+
+ register vec_u8 average = vec_avg(p0, q0);
+ register vec_u8 temp;
+ register vec_u8 uncliped;
+ register vec_u8 ones;
+ register vec_u8 max;
+ register vec_u8 min;
+ register vec_u8 newp1;
temp = vec_xor(average, p2);
average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */
@@ -764,16 +860,16 @@ static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
\
- const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \
+ const vec_u8 A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \
\
- register vec_u8_t pq0bit = vec_xor(p0,q0); \
- register vec_u8_t q1minus; \
- register vec_u8_t p0minus; \
- register vec_u8_t stage1; \
- register vec_u8_t stage2; \
- register vec_u8_t vec160; \
- register vec_u8_t delta; \
- register vec_u8_t deltaneg; \
+ register vec_u8 pq0bit = vec_xor(p0,q0); \
+ register vec_u8 q1minus; \
+ register vec_u8 p0minus; \
+ register vec_u8 stage1; \
+ register vec_u8 stage2; \
+ register vec_u8 vec160; \
+ register vec_u8 delta; \
+ register vec_u8 deltaneg; \
\
q1minus = vec_nor(q1, q1); /* 255 - q1 */ \
stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \
@@ -796,16 +892,16 @@ static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
DECLARE_ALIGNED_16(unsigned char, temp[16]); \
- register vec_u8_t alphavec; \
- register vec_u8_t betavec; \
- register vec_u8_t mask; \
- register vec_u8_t p1mask; \
- register vec_u8_t q1mask; \
+ register vec_u8 alphavec; \
+ register vec_u8 betavec; \
+ register vec_u8 mask; \
+ register vec_u8 p1mask; \
+ register vec_u8 q1mask; \
register vector signed char tc0vec; \
- register vec_u8_t finaltc0; \
- register vec_u8_t tc0masked; \
- register vec_u8_t newp1; \
- register vec_u8_t newq1; \
+ register vec_u8 finaltc0; \
+ register vec_u8 tc0masked; \
+ register vec_u8 newp1; \
+ register vec_u8 newq1; \
\
temp[0] = alpha; \
temp[1] = beta; \
@@ -819,18 +915,18 @@ static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
tc0vec = vec_mergeh(tc0vec, tc0vec); \
tc0vec = vec_mergeh(tc0vec, tc0vec); \
mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \
- finaltc0 = vec_and((vec_u8_t)tc0vec, mask); /* tc = tc0 */ \
+ finaltc0 = vec_and((vec_u8)tc0vec, mask); /* tc = tc0 */ \
\
p1mask = diff_lt_altivec(p2, p0, betavec); \
p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \
- tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec); \
+ tc0masked = vec_and(p1mask, (vec_u8)tc0vec); \
finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \
newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \
/*end if*/ \
\
q1mask = diff_lt_altivec(q2, q0, betavec); \
q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\
- tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec); \
+ tc0masked = vec_and(q1mask, (vec_u8)tc0vec); \
finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \
newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \
/*end if*/ \
@@ -843,12 +939,12 @@ static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
- register vec_u8_t p2 = vec_ld(-3*stride, pix);
- register vec_u8_t p1 = vec_ld(-2*stride, pix);
- register vec_u8_t p0 = vec_ld(-1*stride, pix);
- register vec_u8_t q0 = vec_ld(0, pix);
- register vec_u8_t q1 = vec_ld(stride, pix);
- register vec_u8_t q2 = vec_ld(2*stride, pix);
+ register vec_u8 p2 = vec_ld(-3*stride, pix);
+ register vec_u8 p1 = vec_ld(-2*stride, pix);
+ register vec_u8 p0 = vec_ld(-1*stride, pix);
+ register vec_u8 q0 = vec_ld(0, pix);
+ register vec_u8 q1 = vec_ld(stride, pix);
+ register vec_u8 q2 = vec_ld(2*stride, pix);
h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
vec_st(p1, -2*stride, pix);
vec_st(p0, -1*stride, pix);
@@ -859,7 +955,7 @@ static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
- register vec_u8_t line0, line1, line2, line3, line4, line5;
+ register vec_u8 line0, line1, line2, line3, line4, line5;
if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
return;
readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
@@ -868,6 +964,130 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
write16x4(pix-2, stride, line1, line2, line3, line4);
}
+static av_always_inline
+void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h)
+{
+ int y, aligned;
+ vec_u8 vblock;
+ vec_s16 vtemp, vweight, voffset, v0, v1;
+ vec_u16 vlog2_denom;
+ DECLARE_ALIGNED_16(int32_t, temp[4]);
+ LOAD_ZERO;
+
+ offset <<= log2_denom;
+ if(log2_denom) offset += 1<<(log2_denom-1);
+ temp[0] = log2_denom;
+ temp[1] = weight;
+ temp[2] = offset;
+
+ vtemp = (vec_s16)vec_ld(0, temp);
+ vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
+ vweight = vec_splat(vtemp, 3);
+ voffset = vec_splat(vtemp, 5);
+ aligned = !((unsigned long)block & 0xf);
+
+ for (y=0; y<h; y++) {
+ vblock = vec_ld(0, block);
+
+ v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
+ v1 = (vec_s16)vec_mergel(zero_u8v, vblock);
+
+ if (w == 16 || aligned) {
+ v0 = vec_mladd(v0, vweight, zero_s16v);
+ v0 = vec_adds(v0, voffset);
+ v0 = vec_sra(v0, vlog2_denom);
+ }
+ if (w == 16 || !aligned) {
+ v1 = vec_mladd(v1, vweight, zero_s16v);
+ v1 = vec_adds(v1, voffset);
+ v1 = vec_sra(v1, vlog2_denom);
+ }
+ vblock = vec_packsu(v0, v1);
+ vec_st(vblock, 0, block);
+
+ block += stride;
+ }
+}
+
+static av_always_inline
+void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
+ int weightd, int weights, int offset, int w, int h)
+{
+ int y, dst_aligned, src_aligned;
+ vec_u8 vsrc, vdst;
+ vec_s16 vtemp, vweights, vweightd, voffset, v0, v1, v2, v3;
+ vec_u16 vlog2_denom;
+ DECLARE_ALIGNED_16(int32_t, temp[4]);
+ LOAD_ZERO;
+
+ offset = ((offset + 1) | 1) << log2_denom;
+ temp[0] = log2_denom+1;
+ temp[1] = weights;
+ temp[2] = weightd;
+ temp[3] = offset;
+
+ vtemp = (vec_s16)vec_ld(0, temp);
+ vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
+ vweights = vec_splat(vtemp, 3);
+ vweightd = vec_splat(vtemp, 5);
+ voffset = vec_splat(vtemp, 7);
+ dst_aligned = !((unsigned long)dst & 0xf);
+ src_aligned = !((unsigned long)src & 0xf);
+
+ for (y=0; y<h; y++) {
+ vdst = vec_ld(0, dst);
+ vsrc = vec_ld(0, src);
+
+ v0 = (vec_s16)vec_mergeh(zero_u8v, vdst);
+ v1 = (vec_s16)vec_mergel(zero_u8v, vdst);
+ v2 = (vec_s16)vec_mergeh(zero_u8v, vsrc);
+ v3 = (vec_s16)vec_mergel(zero_u8v, vsrc);
+
+ if (w == 8) {
+ if (src_aligned)
+ v3 = v2;
+ else
+ v2 = v3;
+ }
+
+ if (w == 16 || dst_aligned) {
+ v0 = vec_mladd(v0, vweightd, zero_s16v);
+ v2 = vec_mladd(v2, vweights, zero_s16v);
+
+ v0 = vec_adds(v0, voffset);
+ v0 = vec_adds(v0, v2);
+ v0 = vec_sra(v0, vlog2_denom);
+ }
+ if (w == 16 || !dst_aligned) {
+ v1 = vec_mladd(v1, vweightd, zero_s16v);
+ v3 = vec_mladd(v3, vweights, zero_s16v);
+
+ v1 = vec_adds(v1, voffset);
+ v1 = vec_adds(v1, v3);
+ v1 = vec_sra(v1, vlog2_denom);
+ }
+ vdst = vec_packsu(v0, v1);
+ vec_st(vdst, 0, dst);
+
+ dst += stride;
+ src += stride;
+ }
+}
+
+#define H264_WEIGHT(W,H) \
+static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
+ weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \
+}\
+static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
+ biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
+}
+
+H264_WEIGHT(16,16)
+H264_WEIGHT(16, 8)
+H264_WEIGHT( 8,16)
+H264_WEIGHT( 8, 8)
+H264_WEIGHT( 8, 4)
+
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
if (has_altivec()) {
@@ -875,7 +1095,13 @@ void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec;
c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
c->h264_idct_add = ff_h264_idct_add_altivec;
+ c->h264_idct_add8 = ff_h264_idct_add8_altivec;
+ c->h264_idct_add16 = ff_h264_idct_add16_altivec;
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_altivec;
+ c->h264_idct_dc_add= h264_idct_dc_add_altivec;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_altivec;
c->h264_idct8_add = ff_h264_idct8_add_altivec;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_altivec;
c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
@@ -900,5 +1126,16 @@ void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
dspfunc(put_h264_qpel, 0, 16);
dspfunc(avg_h264_qpel, 0, 16);
#undef dspfunc
+
+ c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec;
+ c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec;
+ c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec;
+ c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec;
+ c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec;
+ c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
+ c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
+ c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
+ c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
+ c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
}
}
diff --git a/libavcodec/ppc/h264_template_altivec.c b/libavcodec/ppc/h264_template_altivec.c
index e050fe5..5f722d0 100644
--- a/libavcodec/ppc/h264_template_altivec.c
+++ b/libavcodec/ppc/h264_template_altivec.c
@@ -28,8 +28,8 @@
/* this code assume that stride % 16 == 0 */
#define CHROMA_MC8_ALTIVEC_CORE \
- vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);\
- vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);\
+ vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
+ vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
psum = vec_mladd(vA, vsrc0ssH, v32ss);\
psum = vec_mladd(vB, vsrc1ssH, psum);\
@@ -38,7 +38,7 @@
psum = vec_sr(psum, v6us);\
\
vdst = vec_ld(0, dst);\
- ppsum = (vec_u8_t)vec_pack(psum, psum);\
+ ppsum = (vec_u8)vec_pack(psum, psum);\
vfdst = vec_perm(vdst, ppsum, fperm);\
\
OP_U8_ALTIVEC(fsum, vfdst, vdst);\
@@ -53,15 +53,15 @@
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
- vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);\
- vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);\
+ vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
+ vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
psum = vec_mladd(vA, vsrc0ssH, v32ss);\
psum = vec_mladd(vE, vsrc1ssH, psum);\
psum = vec_sr(psum, v6us);\
\
vdst = vec_ld(0, dst);\
- ppsum = (vec_u8_t)vec_pack(psum, psum);\
+ ppsum = (vec_u8)vec_pack(psum, psum);\
vfdst = vec_perm(vdst, ppsum, fperm);\
\
OP_U8_ALTIVEC(fsum, vfdst, vdst);\
@@ -80,34 +80,34 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
((8 - x) * ( y)),
(( x) * ( y))};
register int i;
- vec_u8_t fperm;
- const vec_s32_t vABCD = vec_ld(0, ABCD);
- const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
- const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
- const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
- const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
+ vec_u8 fperm;
+ const vec_s32 vABCD = vec_ld(0, ABCD);
+ const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
+ const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
+ const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
+ const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
LOAD_ZERO;
- const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
- const vec_u16_t v6us = vec_splat_u16(6);
+ const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
+ const vec_u16 v6us = vec_splat_u16(6);
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
- vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
- vec_u8_t vsrc0uc, vsrc1uc;
- vec_s16_t vsrc0ssH, vsrc1ssH;
- vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
- vec_s16_t vsrc2ssH, vsrc3ssH, psum;
- vec_u8_t vdst, ppsum, vfdst, fsum;
+ vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
+ vec_u8 vsrc0uc, vsrc1uc;
+ vec_s16 vsrc0ssH, vsrc1ssH;
+ vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
+ vec_s16 vsrc2ssH, vsrc3ssH, psum;
+ vec_u8 vdst, ppsum, vfdst, fsum;
POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
if (((unsigned long)dst) % 16 == 0) {
- fperm = (vec_u8_t){0x10, 0x11, 0x12, 0x13,
+ fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F};
} else {
- fperm = (vec_u8_t){0x00, 0x01, 0x02, 0x03,
+ fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F};
@@ -126,8 +126,8 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
else
vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
- vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
- vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);
+ vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
+ vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
if (ABCD[3]) {
if (!loadSecond) {// -> !reallyBadAlign
@@ -139,7 +139,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
CHROMA_MC8_ALTIVEC_CORE
}
} else {
- vec_u8_t vsrcDuc;
+ vec_u8 vsrcDuc;
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 16, src);
@@ -153,7 +153,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
}
}
} else {
- const vec_s16_t vE = vec_add(vB, vC);
+ const vec_s16 vE = vec_add(vB, vC);
if (ABCD[2]) { // x == 0 B == 0
if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) {
@@ -164,7 +164,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
vsrc0uc = vsrc1uc;
}
} else {
- vec_u8_t vsrcDuc;
+ vec_u8 vsrcDuc;
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 15, src);
@@ -184,7 +184,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
CHROMA_MC8_ALTIVEC_CORE_SIMPLE
}
} else {
- vec_u8_t vsrcDuc;
+ vec_u8 vsrcDuc;
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(0, src);
vsrcDuc = vec_ld(15, src);
@@ -210,35 +210,35 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
register int i;
LOAD_ZERO;
- const vec_u8_t permM2 = vec_lvsl(-2, src);
- const vec_u8_t permM1 = vec_lvsl(-1, src);
- const vec_u8_t permP0 = vec_lvsl(+0, src);
- const vec_u8_t permP1 = vec_lvsl(+1, src);
- const vec_u8_t permP2 = vec_lvsl(+2, src);
- const vec_u8_t permP3 = vec_lvsl(+3, src);
- const vec_s16_t v5ss = vec_splat_s16(5);
- const vec_u16_t v5us = vec_splat_u16(5);
- const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
- const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
-
- vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+ const vec_u8 permM2 = vec_lvsl(-2, src);
+ const vec_u8 permM1 = vec_lvsl(-1, src);
+ const vec_u8 permP0 = vec_lvsl(+0, src);
+ const vec_u8 permP1 = vec_lvsl(+1, src);
+ const vec_u8 permP2 = vec_lvsl(+2, src);
+ const vec_u8 permP3 = vec_lvsl(+3, src);
+ const vec_s16 v5ss = vec_splat_s16(5);
+ const vec_u16 v5us = vec_splat_u16(5);
+ const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
+ const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
+
+ vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
register int align = ((((unsigned long)src) - 2) % 16);
- vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
+ vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB;
- vec_u8_t sum, vdst, fsum;
+ vec_u8 sum, vdst, fsum;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
for (i = 0 ; i < 16 ; i ++) {
- vec_u8_t srcR1 = vec_ld(-2, src);
- vec_u8_t srcR2 = vec_ld(14, src);
+ vec_u8 srcR1 = vec_ld(-2, src);
+ vec_u8 srcR2 = vec_ld(14, src);
switch (align) {
default: {
@@ -258,7 +258,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = srcR2;
} break;
case 12: {
- vec_u8_t srcR3 = vec_ld(30, src);
+ vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
@@ -267,7 +267,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 13: {
- vec_u8_t srcR3 = vec_ld(30, src);
+ vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
@@ -276,7 +276,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 14: {
- vec_u8_t srcR3 = vec_ld(30, src);
+ vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = srcR2;
@@ -285,7 +285,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 15: {
- vec_u8_t srcR3 = vec_ld(30, src);
+ vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = srcR2;
srcP0 = vec_perm(srcR2, srcR3, permP0);
@@ -295,20 +295,20 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
} break;
}
- srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
- srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
- srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
- srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
+ srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
+ srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
+ srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
+ srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
- srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
- srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
- srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
- srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
+ srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
+ srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
+ srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
+ srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
- srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
- srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
- srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
- srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
+ srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
+ srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
+ srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
+ srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B);
@@ -354,52 +354,52 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
register int i;
LOAD_ZERO;
- const vec_u8_t perm = vec_lvsl(0, src);
- const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
- const vec_u16_t v5us = vec_splat_u16(5);
- const vec_s16_t v5ss = vec_splat_s16(5);
- const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
+ const vec_u8 perm = vec_lvsl(0, src);
+ const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
+ const vec_u16 v5us = vec_splat_u16(5);
+ const vec_s16 v5ss = vec_splat_s16(5);
+ const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
uint8_t *srcbis = src - (srcStride * 2);
- const vec_u8_t srcM2a = vec_ld(0, srcbis);
- const vec_u8_t srcM2b = vec_ld(16, srcbis);
- const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
+ const vec_u8 srcM2a = vec_ld(0, srcbis);
+ const vec_u8 srcM2b = vec_ld(16, srcbis);
+ const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
//srcbis += srcStride;
- const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
- const vec_u8_t srcM1b = vec_ld(16, srcbis);
- const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
+ const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
+ const vec_u8 srcM1b = vec_ld(16, srcbis);
+ const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
//srcbis += srcStride;
- const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
- const vec_u8_t srcP0b = vec_ld(16, srcbis);
- const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
+ const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
+ const vec_u8 srcP0b = vec_ld(16, srcbis);
+ const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
//srcbis += srcStride;
- const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
- const vec_u8_t srcP1b = vec_ld(16, srcbis);
- const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
+ const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
+ const vec_u8 srcP1b = vec_ld(16, srcbis);
+ const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
//srcbis += srcStride;
- const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
- const vec_u8_t srcP2b = vec_ld(16, srcbis);
- const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
+ const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
+ const vec_u8 srcP2b = vec_ld(16, srcbis);
+ const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
//srcbis += srcStride;
- vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
- vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
- vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
- vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
- vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
- vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
- vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
- vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
- vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
- vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
-
- vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
+ vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
+ vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
+ vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
+ vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
+ vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
+ vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
+ vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
+ vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
+ vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
+ vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
+
+ vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB,
srcP3ssA, srcP3ssB,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
- vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;
+ vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
@@ -407,8 +407,8 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3a = vec_ld(0, srcbis += srcStride);
srcP3b = vec_ld(16, srcbis);
srcP3 = vec_perm(srcP3a, srcP3b, perm);
- srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
- srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
+ srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
+ srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
//srcbis += srcStride;
sum1A = vec_adds(srcP0ssA, srcP1ssA);
@@ -463,49 +463,49 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
register int i;
LOAD_ZERO;
- const vec_u8_t permM2 = vec_lvsl(-2, src);
- const vec_u8_t permM1 = vec_lvsl(-1, src);
- const vec_u8_t permP0 = vec_lvsl(+0, src);
- const vec_u8_t permP1 = vec_lvsl(+1, src);
- const vec_u8_t permP2 = vec_lvsl(+2, src);
- const vec_u8_t permP3 = vec_lvsl(+3, src);
- const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
- const vec_u32_t v10ui = vec_splat_u32(10);
- const vec_s16_t v5ss = vec_splat_s16(5);
- const vec_s16_t v1ss = vec_splat_s16(1);
- const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
- const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
+ const vec_u8 permM2 = vec_lvsl(-2, src);
+ const vec_u8 permM1 = vec_lvsl(-1, src);
+ const vec_u8 permP0 = vec_lvsl(+0, src);
+ const vec_u8 permP1 = vec_lvsl(+1, src);
+ const vec_u8 permP2 = vec_lvsl(+2, src);
+ const vec_u8 permP3 = vec_lvsl(+3, src);
+ const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
+ const vec_u32 v10ui = vec_splat_u32(10);
+ const vec_s16 v5ss = vec_splat_s16(5);
+ const vec_s16 v1ss = vec_splat_s16(1);
+ const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
+ const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
register int align = ((((unsigned long)src) - 2) % 16);
- vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
+ vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, psumA, psumB;
- const vec_u8_t mperm = (const vec_u8_t)
+ const vec_u8 mperm = (const vec_u8)
{0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
int16_t *tmpbis = tmp;
- vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
+ vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
tmpP2ssA, tmpP2ssB;
- vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
+ vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
ssumAe, ssumAo, ssumBe, ssumBo;
- vec_u8_t fsum, sumv, sum, vdst;
- vec_s16_t ssume, ssumo;
+ vec_u8 fsum, sumv, sum, vdst;
+ vec_s16 ssume, ssumo;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
src -= (2 * srcStride);
for (i = 0 ; i < 21 ; i ++) {
- vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
- vec_u8_t srcR1 = vec_ld(-2, src);
- vec_u8_t srcR2 = vec_ld(14, src);
+ vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+ vec_u8 srcR1 = vec_ld(-2, src);
+ vec_u8 srcR2 = vec_ld(14, src);
switch (align) {
default: {
@@ -525,7 +525,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = srcR2;
} break;
case 12: {
- vec_u8_t srcR3 = vec_ld(30, src);
+ vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
@@ -534,7 +534,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 13: {
- vec_u8_t srcR3 = vec_ld(30, src);
+ vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
@@ -543,7 +543,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 14: {
- vec_u8_t srcR3 = vec_ld(30, src);
+ vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = srcR2;
@@ -552,7 +552,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 15: {
- vec_u8_t srcR3 = vec_ld(30, src);
+ vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = srcR2;
srcP0 = vec_perm(srcR2, srcR3, permP0);
@@ -562,20 +562,20 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
} break;
}
- srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
- srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
- srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
- srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
+ srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
+ srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
+ srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
+ srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
- srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
- srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
- srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
- srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
+ srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
+ srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
+ srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
+ srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
- srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
- srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
- srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
- srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
+ srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
+ srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
+ srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
+ srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B);
@@ -617,15 +617,15 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
tmpbis += tmpStride;
for (i = 0 ; i < 16 ; i++) {
- const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
- const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);
+ const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
+ const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
- const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
- const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
- const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
- const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
- const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
- const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
+ const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
+ const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
+ const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
+ const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
+ const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
+ const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
tmpbis += tmpStride;
@@ -650,9 +650,9 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
pp2Be = vec_mule(sum2B, v5ss);
pp2Bo = vec_mulo(sum2B, v5ss);
- pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
+ pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
pp3Ao = vec_mulo(sum3A, v1ss);
- pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
+ pp3Be = vec_sra((vec_s32)sum3B, v16ui);
pp3Bo = vec_mulo(sum3B, v1ss);
pp1cAe = vec_add(pp1Ae, v512si);
diff --git a/libavcodec/ppc/idct_altivec.c b/libavcodec/ppc/idct_altivec.c
index 7acef48..94b6598 100644
--- a/libavcodec/ppc/idct_altivec.c
+++ b/libavcodec/ppc/idct_altivec.c
@@ -40,17 +40,9 @@
#include "libavcodec/dsputil.h"
#include "gcc_fixes.h"
-
+#include "types_altivec.h"
#include "dsputil_ppc.h"
-#define vector_s16_t vector signed short
-#define const_vector_s16_t const vector signed short
-#define vector_u16_t vector unsigned short
-#define vector_s8_t vector signed char
-#define vector_u8_t vector unsigned char
-#define vector_s32_t vector signed int
-#define vector_u32_t vector unsigned int
-
#define IDCT_HALF \
/* 1st stage */ \
t1 = vec_mradds (a1, vx7, vx1 ); \
@@ -88,11 +80,11 @@
#define IDCT \
- vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \
- vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
- vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias; \
- vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8; \
- vector_u16_t shift; \
+ vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \
+ vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
+ vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias; \
+ vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \
+ vec_u16 shift; \
\
c4 = vec_splat (constants[0], 0); \
a0 = vec_splat (constants[0], 1); \
@@ -100,7 +92,7 @@
a2 = vec_splat (constants[0], 3); \
mc4 = vec_splat (constants[0], 4); \
ma2 = vec_splat (constants[0], 5); \
- bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3); \
+ bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3); \
\
zero = vec_splat_s16 (0); \
shift = vec_splat_u16 (4); \
@@ -156,7 +148,7 @@
vx7 = vec_sra (vy7, shift);
-static const_vector_s16_t constants[5] = {
+static const vec_s16 constants[5] = {
{23170, 13573, 6518, 21895, -23170, -21895, 32, 31},
{16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725},
{22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521},
@@ -164,10 +156,10 @@ static const_vector_s16_t constants[5] = {
{19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722}
};
-void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block)
+void idct_put_altivec(uint8_t* dest, int stride, vec_s16* block)
{
POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
- vector_u8_t tmp;
+ vec_u8 tmp;
#ifdef CONFIG_POWERPC_PERF
POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
@@ -176,8 +168,8 @@ POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
#define COPY(dest,src) \
tmp = vec_packsu (src, src); \
- vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \
- vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+ vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \
+ vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
COPY (dest, vx0) dest += stride;
COPY (dest, vx1) dest += stride;
@@ -191,14 +183,14 @@ POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
}
-void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block)
+void idct_add_altivec(uint8_t* dest, int stride, vec_s16* block)
{
POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
- vector_u8_t tmp;
- vector_s16_t tmp2, tmp3;
- vector_u8_t perm0;
- vector_u8_t perm1;
- vector_u8_t p0, p1, p;
+ vec_u8 tmp;
+ vec_s16 tmp2, tmp3;
+ vec_u8 perm0;
+ vec_u8 perm1;
+ vec_u8 p0, p1, p;
#ifdef CONFIG_POWERPC_PERF
POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
@@ -215,11 +207,11 @@ POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
#define ADD(dest,src,perm) \
/* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
tmp = vec_ld (0, dest); \
- tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm); \
+ tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \
tmp3 = vec_adds (tmp2, src); \
tmp = vec_packsu (tmp3, tmp3); \
- vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \
- vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
+ vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \
+ vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
ADD (dest, vx0, perm0) dest += stride;
ADD (dest, vx1, perm1) dest += stride;
diff --git a/libavcodec/ppc/imgresample_altivec.c b/libavcodec/ppc/imgresample_altivec.c
index b38e41b..fdbca5c 100644
--- a/libavcodec/ppc/imgresample_altivec.c
+++ b/libavcodec/ppc/imgresample_altivec.c
@@ -24,17 +24,13 @@
* High quality image resampling with polyphase filters - AltiVec bits
*/
-#include "gcc_fixes.h"
-
-typedef union {
- vector unsigned char v;
- unsigned char c[16];
-} vec_uc_t;
+#include "util_altivec.h"
+#define FILTER_BITS 8
typedef union {
vector signed short v;
signed short s[8];
-} vec_ss_t;
+} vec_ss;
void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
int wrap, int16_t *filter)
@@ -42,7 +38,7 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
int sum, i;
const uint8_t *s;
vector unsigned char *tv, tmp, dstv, zero;
- vec_ss_t srchv[4], srclv[4], fv[4];
+ vec_ss srchv[4], srclv[4], fv[4];
vector signed short zeros, sumhv, sumlv;
s = src;
diff --git a/libavcodec/ppc/int_altivec.c b/libavcodec/ppc/int_altivec.c
index 8bd3936..5cadea2 100644
--- a/libavcodec/ppc/int_altivec.c
+++ b/libavcodec/ppc/int_altivec.c
@@ -79,10 +79,10 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
static void add_int16_altivec(int16_t * v1, int16_t * v2, int order)
{
int i;
- register vec_s16_t vec, *pv;
+ register vec_s16 vec, *pv;
for(i = 0; i < order; i += 8){
- pv = (vec_s16_t*)v2;
+ pv = (vec_s16*)v2;
vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
vec_st(vec_add(vec_ld(0, v1), vec), 0, v1);
v1 += 8;
@@ -93,10 +93,10 @@ static void add_int16_altivec(int16_t * v1, int16_t * v2, int order)
static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order)
{
int i;
- register vec_s16_t vec, *pv;
+ register vec_s16 vec, *pv;
for(i = 0; i < order; i += 8){
- pv = (vec_s16_t*)v2;
+ pv = (vec_s16*)v2;
vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1);
v1 += 8;
@@ -108,9 +108,9 @@ static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order
{
int i;
LOAD_ZERO;
- register vec_s16_t vec1, *pv;
- register vec_s32_t res = vec_splat_s32(0), t;
- register vec_u32_t shifts;
+ register vec_s16 vec1, *pv;
+ register vec_s32 res = vec_splat_s32(0), t;
+ register vec_u32 shifts;
DECLARE_ALIGNED_16(int32_t, ires);
shifts = zero_u32v;
@@ -121,7 +121,7 @@ static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order
if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));
for(i = 0; i < order; i += 8){
- pv = (vec_s16_t*)v1;
+ pv = (vec_s16*)v1;
vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
t = vec_sr(t, shifts);
diff --git a/libavcodec/ppc/mathops.h b/libavcodec/ppc/mathops.h
index 776ee62..edfe2ea 100644
--- a/libavcodec/ppc/mathops.h
+++ b/libavcodec/ppc/mathops.h
@@ -23,7 +23,7 @@
#ifndef AVCODEC_PPC_MATHOPS_H
#define AVCODEC_PPC_MATHOPS_H
-#if defined(ARCH_POWERPC_405)
+#if defined(ARCH_PPC_405)
/* signed 16x16 -> 32 multiply add accumulate */
#define MAC16(rt, ra, rb) \
__asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));
diff --git a/libavcodec/ppc/mpegvideo_altivec.c b/libavcodec/ppc/mpegvideo_altivec.c
index ba1719f..28ada7a 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -585,8 +585,8 @@ POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63);
}
-extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
-extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
+void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
+void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
void MPV_common_init_altivec(MpegEncContext *s)
{
diff --git a/libavcodec/ppc/types_altivec.h b/libavcodec/ppc/types_altivec.h
index 30963c2..2870e83 100644
--- a/libavcodec/ppc/types_altivec.h
+++ b/libavcodec/ppc/types_altivec.h
@@ -24,23 +24,23 @@
/***********************************************************************
* Vector types
**********************************************************************/
-#define vec_u8_t vector unsigned char
-#define vec_s8_t vector signed char
-#define vec_u16_t vector unsigned short
-#define vec_s16_t vector signed short
-#define vec_u32_t vector unsigned int
-#define vec_s32_t vector signed int
+#define vec_u8 vector unsigned char
+#define vec_s8 vector signed char
+#define vec_u16 vector unsigned short
+#define vec_s16 vector signed short
+#define vec_u32 vector unsigned int
+#define vec_s32 vector signed int
/***********************************************************************
* Null vector
**********************************************************************/
-#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )
+#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8( 0 )
-#define zero_u8v (vec_u8_t) zerov
-#define zero_s8v (vec_s8_t) zerov
-#define zero_u16v (vec_u16_t) zerov
-#define zero_s16v (vec_s16_t) zerov
-#define zero_u32v (vec_u32_t) zerov
-#define zero_s32v (vec_s32_t) zerov
+#define zero_u8v (vec_u8) zerov
+#define zero_s8v (vec_s8) zerov
+#define zero_u16v (vec_u16) zerov
+#define zero_s16v (vec_s16) zerov
+#define zero_u32v (vec_u32) zerov
+#define zero_s32v (vec_s32) zerov
#endif /* AVCODEC_PPC_TYPES_ALTIVEC_H */
diff --git a/libavcodec/pthread.c b/libavcodec/pthread.c
index 71f2da6..e178993 100644
--- a/libavcodec/pthread.c
+++ b/libavcodec/pthread.c
@@ -25,11 +25,11 @@
#include "avcodec.h"
-typedef int (action_t)(AVCodecContext *c, void *arg);
+typedef int (action_func)(AVCodecContext *c, void *arg);
typedef struct ThreadContext {
pthread_t *workers;
- action_t *func;
+ action_func *func;
void *args;
int *rets;
int rets_count;
@@ -101,7 +101,7 @@ void avcodec_thread_free(AVCodecContext *avctx)
av_freep(&avctx->thread_opaque);
}
-int avcodec_thread_execute(AVCodecContext *avctx, action_t* func, void *arg, int *ret, int job_count, int job_size)
+int avcodec_thread_execute(AVCodecContext *avctx, action_func* func, void *arg, int *ret, int job_count, int job_size)
{
ThreadContext *c= avctx->thread_opaque;
int dummy_ret;
diff --git a/libavcodec/qcelp_lsp.c b/libavcodec/qcelp_lsp.c
new file mode 100644
index 0000000..a6c6595
--- /dev/null
+++ b/libavcodec/qcelp_lsp.c
@@ -0,0 +1,102 @@
+/*
+ * QCELP decoder
+ * Copyright (c) 2007 Reynaldo H. Verdejo Pinochet
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file qcelp_lsp.c
+ * QCELP decoder
+ * @author Reynaldo H. Verdejo Pinochet
+ * @remark FFmpeg merging spearheaded by Kenan Gillet
+ * @remark Development mentored by Benjamin Larson
+ */
+
+#include "libavutil/mathematics.h"
+
+/**
+ * initial coefficient to perform bandwidth expansion on LPC
+ *
+ * @note: 0.9883 looks like an approximation of 253/256.
+ *
+ * TIA/EIA/IS-733 2.4.3.3.6 6
+ */
+#define QCELP_BANDWITH_EXPANSION_COEFF 0.9883
+
+/**
+ * Computes the Pa / (1 + z(-1)) or Qa / (1 - z(-1)) coefficients
+ * needed for LSP to LPC conversion.
+ * We only need to calculate the first 6 elements of the polynomial.
+ *
+ * @param lspf line spectral pair frequencies
+ * @param f [out] polynomial input/output as a vector
+ *
+ * TIA/EIA/IS-733 2.4.3.3.5-1/2
+ */
+static void lsp2polyf(const float *lspf, double *f, int lp_half_order)
+{
+ int i, j;
+
+ f[0] = 1.0;
+ f[1] = -2 * cos(M_PI * lspf[0]);
+ lspf -= 2;
+ for(i=2; i<=lp_half_order; i++)
+ {
+ double val = -2 * cos(M_PI * lspf[2*i]);
+ f[i] = val * f[i-1] + 2*f[i-2];
+ for(j=i-1; j>1; j--)
+ f[j] += f[j-1] * val + f[j-2];
+ f[1] += val;
+ }
+}
+
+/**
+ * Reconstructs LPC coefficients from the line spectral pair frequencies
+ * and performs bandwidth expansion.
+ *
+ * @param lspf line spectral pair frequencies
+ * @param lpc linear predictive coding coefficients
+ *
+ * @note: bandwith_expansion_coeff could be precalculated into a table
+ * but it seems to be slower on x86
+ *
+ * TIA/EIA/IS-733 2.4.3.3.5
+ */
+void ff_qcelp_lspf2lpc(const float *lspf, float *lpc)
+{
+ double pa[6], qa[6];
+ int i;
+ double bandwith_expansion_coeff = -QCELP_BANDWITH_EXPANSION_COEFF * 0.5;
+
+ lsp2polyf(lspf, pa, 5);
+ lsp2polyf(lspf + 1, qa, 5);
+
+ for (i=4; i>=0; i--)
+ {
+ double paf = pa[i+1] + pa[i];
+ double qaf = qa[i+1] - qa[i];
+
+ lpc[i ] = paf + qaf;
+ lpc[9-i] = paf - qaf;
+ }
+ for (i=0; i<10; i++)
+ {
+ lpc[i] *= bandwith_expansion_coeff;
+ bandwith_expansion_coeff *= QCELP_BANDWITH_EXPANSION_COEFF;
+ }
+}
diff --git a/libavcodec/qcelpdata.h b/libavcodec/qcelpdata.h
index 856f11d..9d5915b 100644
--- a/libavcodec/qcelpdata.h
+++ b/libavcodec/qcelpdata.h
@@ -1,5 +1,5 @@
/*
- * part of QCELP decoder
+ * QCELP decoder
* Copyright (c) 2007 Reynaldo H. Verdejo Pinochet
*
* This file is part of FFmpeg.
@@ -22,7 +22,48 @@
#ifndef AVCODEC_QCELPDATA_H
#define AVCODEC_QCELPDATA_H
+/**
+ * @file qcelpdata.h
+ * Data tables for the QCELP decoder
+ * @author Reynaldo H. Verdejo Pinochet
+ * @remark FFmpeg merging spearheaded by Kenan Gillet
+ * @remark Development mentored by Benjamin Larson
+ */
+
+#include <stddef.h>
#include <stdint.h>
+#include "libavutil/common.h"
+
+/**
+ * QCELP unpacked data frame
+ */
+typedef struct {
+/// @defgroup qcelp_codebook_parameters QCELP excitation codebook parameters
+/// @{
+ uint8_t cbsign[16]; ///< sign of the codebook gain for each codebook subframe
+ uint8_t cbgain[16]; ///< unsigned codebook gain for each codebook subframe
+ uint8_t cindex[16]; ///< codebook index for each codebook subframe
+/// @}
+
+/// @defgroup qcelp_pitch_parameters QCELP pitch prediction parameters
+/// @{
+ uint8_t plag[4]; ///< pitch lag for each pitch subframe
+ uint8_t pfrac[4]; ///< fractional pitch lag for each pitch subframe
+ uint8_t pgain[4]; ///< pitch gain for each pitch subframe
+/// @}
+
+ /**
+ * line spectral pair frequencies (LSP) for RATE_OCTAVE,
+ * line spectral pair frequencies grouped into five vectors
+ * of dimension two (LSPV) for other rates
+ */
+ uint8_t lspv[10];
+
+ /**
+ * reserved bits only present in bitrate 1, 1/4 and 1/8 packets
+ */
+ uint8_t reserved;
+} QCELPFrame;
/**
* pre-calculated table for hammsinc function
@@ -38,7 +79,7 @@ typedef struct {
uint8_t bitlen; /*!< number of bits to read */
} QCELPBitmap;
-#define QCELP_OF(variable, bit, len) {offsetof(QCELPContext, variable), bit, len}
+#define QCELP_OF(variable, bit, len) {offsetof(QCELPFrame, variable), bit, len}
/**
* bitmap unpacking tables for RATE_FULL
@@ -232,7 +273,7 @@ static const QCELPBitmap * const qcelp_unpacking_bitmaps_per_rate[5] = {
qcelp_rate_full_bitmap,
};
-static const uint16_t qcelp_bits_per_rate[5] = {
+static const uint16_t qcelp_unpacking_bitmaps_lengths[5] = {
0, ///!< for SILENCE rate
FF_ARRAY_ELEMS(qcelp_rate_octave_bitmap),
FF_ARRAY_ELEMS(qcelp_rate_quarter_bitmap),
@@ -384,6 +425,16 @@ static const qcelp_vector * const qcelp_lspvq[5] = {
#define QCELP_SCALE 8192.
/**
+ * the upper boundary of the clipping, depends on QCELP_SCALE
+ */
+#define QCELP_CLIP_UPPER_BOUND (8191.75/8192.)
+
+/**
+ * the lower boundary of the clipping, depends on QCELP_SCALE
+ */
+#define QCELP_CLIP_LOWER_BOUND -1.
+
+/**
* table for computing Ga (decoded linear codebook gain magnitude)
*
* @note The table could fit in int16_t in x*8 form, but it seems
@@ -406,7 +457,7 @@ static const float qcelp_g12ga[61] = {
100.000/QCELP_SCALE, 112.250/QCELP_SCALE, 125.875/QCELP_SCALE, 141.250/QCELP_SCALE,
158.500/QCELP_SCALE, 177.875/QCELP_SCALE, 199.500/QCELP_SCALE, 223.875/QCELP_SCALE,
251.250/QCELP_SCALE, 281.875/QCELP_SCALE, 316.250/QCELP_SCALE, 354.875/QCELP_SCALE,
- 398.125/QCELP_SCALE, 446.625/QCELP_SCALE, 501.125/QCELP_SCALE, 563.375/QCELP_SCALE,
+ 398.125/QCELP_SCALE, 446.625/QCELP_SCALE, 501.125/QCELP_SCALE, 562.375/QCELP_SCALE,
631.000/QCELP_SCALE, 708.000/QCELP_SCALE, 794.375/QCELP_SCALE, 891.250/QCELP_SCALE,
1000.000/QCELP_SCALE};
@@ -463,7 +514,7 @@ static const int8_t qcelp_rate_half_codebook[128] = {
/**
* sqrt(1.887) is the maximum of the pseudorandom
* white sequence used to generate the scaled codebook
- * vector for framerate 1/4.
+ * vector for bitrate 1/4.
*
* TIA/EIA/IS-733 2.4.8.1.2
*/
@@ -471,9 +522,9 @@ static const int8_t qcelp_rate_half_codebook[128] = {
/**
* table for impulse response of BPF used to filter
- * the white excitation for framerate 1/4 synthesis
+ * the white excitation for bitrate 1/4 synthesis
*
- * Only half the tables are needed because of symetry.
+ * Only half the tables are needed because of symmetry.
*
* TIA/EIA/IS-733 2.4.8.1.2-1.1
*/
@@ -483,4 +534,20 @@ static const double qcelp_rnd_fir_coefs[11] = {
-9.918777e-2, 3.749518e-2, 8.985137e-1
};
+/**
+ * This spread factor is used, for bitrate 1/8 and I_F_Q,
+ * to force the LSP frequencies to be at least 80 Hz apart.
+ *
+ * TIA/EIA/IS-733 2.4.3.3.2
+ */
+#define QCELP_LSP_SPREAD_FACTOR 0.02
+
+/**
+ * predictor coefficient for the conversion of LSP codes
+ * to LSP frequencies for 1/8 and I_F_Q
+ *
+ * TIA/EIA/IS-733 2.4.3.2.7-2
+ */
+#define QCELP_LSP_OCTAVE_PREDICTOR 29.0/32
+
#endif /* AVCODEC_QCELPDATA_H */
diff --git a/libavcodec/qcelpdec.c b/libavcodec/qcelpdec.c
index c0bf2cc..c65b094 100644
--- a/libavcodec/qcelpdec.c
+++ b/libavcodec/qcelpdec.c
@@ -18,18 +18,21 @@
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+
/**
* @file qcelpdec.c
* QCELP decoder
* @author Reynaldo H. Verdejo Pinochet
+ * @remark FFmpeg merging spearheaded by Kenan Gillet
+ * @remark Development mentored by Benjamin Larson
*/
#include <stddef.h>
#include "avcodec.h"
+#include "internal.h"
#include "bitstream.h"
-#include "qcelp.h"
#include "qcelpdata.h"
#include "celp_math.h"
@@ -38,15 +41,53 @@
#undef NDEBUG
#include <assert.h>
-static void weighted_vector_sumf(float *out,
- const float *in_a,
- const float *in_b,
- float weight_coeff_a,
- float weight_coeff_b,
- int length) {
- int i;
+typedef enum
+{
+ I_F_Q = -1, /*!< insufficient frame quality */
+ SILENCE,
+ RATE_OCTAVE,
+ RATE_QUARTER,
+ RATE_HALF,
+ RATE_FULL
+} qcelp_packet_rate;
+
+typedef struct
+{
+ GetBitContext gb;
+ qcelp_packet_rate bitrate;
+ QCELPFrame frame; /*!< unpacked data frame */
+
+ uint8_t erasure_count;
+ uint8_t octave_count; /*!< count the consecutive RATE_OCTAVE frames */
+ float prev_lspf[10];
+ float predictor_lspf[10];/*!< LSP predictor for RATE_OCTAVE and I_F_Q */
+ float pitch_synthesis_filter_mem[303];
+ float pitch_pre_filter_mem[303];
+ float rnd_fir_filter_mem[180];
+ float formant_mem[170];
+ float last_codebook_gain;
+ int prev_g1[2];
+ int prev_bitrate;
+ float pitch_gain[4];
+ uint8_t pitch_lag[4];
+ uint16_t first16bits;
+ uint8_t warned_buf_mismatch_bitrate;
+} QCELPContext;
+
+/**
+ * Reconstructs LPC coefficients from the line spectral pair frequencies.
+ *
+ * TIA/EIA/IS-733 2.4.3.3.5
+ */
+void ff_qcelp_lspf2lpc(const float *lspf, float *lpc);
- for (i = 0; i < length; i++)
+static void weighted_vector_sumf(float *out, const float *in_a,
+ const float *in_b, float weight_coeff_a,
+ float weight_coeff_b, int length)
+{
+ int i;
+
+ for(i=0; i<length; i++)
out[i] = weight_coeff_a * in_a[i]
+ weight_coeff_b * in_b[i];
}
@@ -56,15 +97,224 @@ static void weighted_vector_sumf(float *out,
*
* TIA/EIA/IS-733 2.4.9
*/
-static av_cold int qcelp_decode_init(AVCodecContext *avctx) {
+static av_cold int qcelp_decode_init(AVCodecContext *avctx)
+{
QCELPContext *q = avctx->priv_data;
int i;
avctx->sample_fmt = SAMPLE_FMT_FLT;
- for (i = 0; i < 10; i++)
- q->prev_lspf[i] = (i + 1) / 11.;
+ for(i=0; i<10; i++)
+ q->prev_lspf[i] = (i+1)/11.;
+
+ return 0;
+}
+
+/**
+ * Decodes the 10 quantized LSP frequencies from the LSPV/LSP
+ * transmission codes of any bitrate and checks for badly received packets.
+ *
+ * @param q the context
+ * @param lspf line spectral pair frequencies
+ *
+ * @return 0 on success, -1 if the packet is badly received
+ *
+ * TIA/EIA/IS-733 2.4.3.2.6.2-2, 2.4.8.7.3
+ */
+static int decode_lspf(QCELPContext *q, float *lspf)
+{
+ int i;
+ float tmp_lspf, smooth, erasure_coeff;
+ const float *predictors;
+
+ if(q->bitrate == RATE_OCTAVE || q->bitrate == I_F_Q)
+ {
+ predictors = (q->prev_bitrate != RATE_OCTAVE &&
+ q->prev_bitrate != I_F_Q ?
+ q->prev_lspf : q->predictor_lspf);
+
+ if(q->bitrate == RATE_OCTAVE)
+ {
+ q->octave_count++;
+
+ for(i=0; i<10; i++)
+ {
+ q->predictor_lspf[i] =
+ lspf[i] = (q->frame.lspv[i] ? QCELP_LSP_SPREAD_FACTOR
+ : -QCELP_LSP_SPREAD_FACTOR)
+ + predictors[i] * QCELP_LSP_OCTAVE_PREDICTOR
+ + (i + 1) * ((1 - QCELP_LSP_OCTAVE_PREDICTOR)/11);
+ }
+ smooth = (q->octave_count < 10 ? .875 : 0.1);
+ }else
+ {
+ erasure_coeff = QCELP_LSP_OCTAVE_PREDICTOR;
+
+ assert(q->bitrate == I_F_Q);
+
+ if(q->erasure_count > 1)
+ erasure_coeff *= (q->erasure_count < 4 ? 0.9 : 0.7);
+
+ for(i=0; i<10; i++)
+ {
+ q->predictor_lspf[i] =
+ lspf[i] = (i + 1) * ( 1 - erasure_coeff)/11
+ + erasure_coeff * predictors[i];
+ }
+ smooth = 0.125;
+ }
+
+ // Check the stability of the LSP frequencies.
+ lspf[0] = FFMAX(lspf[0], QCELP_LSP_SPREAD_FACTOR);
+ for(i=1; i<10; i++)
+ lspf[i] = FFMAX(lspf[i], (lspf[i-1] + QCELP_LSP_SPREAD_FACTOR));
+
+ lspf[9] = FFMIN(lspf[9], (1.0 - QCELP_LSP_SPREAD_FACTOR));
+ for(i=9; i>0; i--)
+ lspf[i-1] = FFMIN(lspf[i-1], (lspf[i] - QCELP_LSP_SPREAD_FACTOR));
+
+ // Low-pass filter the LSP frequencies.
+ weighted_vector_sumf(lspf, lspf, q->prev_lspf, smooth, 1.0-smooth, 10);
+ }else
+ {
+ q->octave_count = 0;
+
+ tmp_lspf = 0.;
+ for(i=0; i<5 ; i++)
+ {
+ lspf[2*i+0] = tmp_lspf += qcelp_lspvq[i][q->frame.lspv[i]][0] * 0.0001;
+ lspf[2*i+1] = tmp_lspf += qcelp_lspvq[i][q->frame.lspv[i]][1] * 0.0001;
+ }
+
+ // Check for badly received packets.
+ if(q->bitrate == RATE_QUARTER)
+ {
+ if(lspf[9] <= .70 || lspf[9] >= .97)
+ return -1;
+ for(i=3; i<10; i++)
+ if(fabs(lspf[i] - lspf[i-2]) < .08)
+ return -1;
+ }else
+ {
+ if(lspf[9] <= .66 || lspf[9] >= .985)
+ return -1;
+ for(i=4; i<10; i++)
+ if (fabs(lspf[i] - lspf[i-4]) < .0931)
+ return -1;
+ }
+ }
+ return 0;
+}
+
+/**
+ * Converts codebook transmission codes to GAIN and INDEX.
+ *
+ * @param q the context
+ * @param gain array holding the decoded gain
+ *
+ * TIA/EIA/IS-733 2.4.6.2
+ */
+static void decode_gain_and_index(QCELPContext *q,
+ float *gain) {
+ int i, subframes_count, g1[16];
+ float slope;
+
+ if(q->bitrate >= RATE_QUARTER)
+ {
+ switch(q->bitrate)
+ {
+ case RATE_FULL: subframes_count = 16; break;
+ case RATE_HALF: subframes_count = 4; break;
+ default: subframes_count = 5;
+ }
+ for(i=0; i<subframes_count; i++)
+ {
+ g1[i] = 4 * q->frame.cbgain[i];
+ if(q->bitrate == RATE_FULL && !((i+1) & 3))
+ {
+ g1[i] += av_clip((g1[i-1] + g1[i-2] + g1[i-3]) / 3 - 6, 0, 32);
+ }
+
+ gain[i] = qcelp_g12ga[g1[i]];
+
+ if(q->frame.cbsign[i])
+ {
+ gain[i] = -gain[i];
+ q->frame.cindex[i] = (q->frame.cindex[i]-89) & 127;
+ }
+ }
+
+ q->prev_g1[0] = g1[i-2];
+ q->prev_g1[1] = g1[i-1];
+ q->last_codebook_gain = qcelp_g12ga[g1[i-1]];
+
+ if(q->bitrate == RATE_QUARTER)
+ {
+ // Provide smoothing of the unvoiced excitation energy.
+ gain[7] = gain[4];
+ gain[6] = 0.4*gain[3] + 0.6*gain[4];
+ gain[5] = gain[3];
+ gain[4] = 0.8*gain[2] + 0.2*gain[3];
+ gain[3] = 0.2*gain[1] + 0.8*gain[2];
+ gain[2] = gain[1];
+ gain[1] = 0.6*gain[0] + 0.4*gain[1];
+ }
+ }else
+ {
+ if(q->bitrate == RATE_OCTAVE)
+ {
+ g1[0] = 2 * q->frame.cbgain[0]
+ + av_clip((q->prev_g1[0] + q->prev_g1[1]) / 2 - 5, 0, 54);
+ subframes_count = 8;
+ }else
+ {
+ assert(q->bitrate == I_F_Q);
+
+ g1[0] = q->prev_g1[1];
+ switch(q->erasure_count)
+ {
+ case 1 : break;
+ case 2 : g1[0] -= 1; break;
+ case 3 : g1[0] -= 2; break;
+ default: g1[0] -= 6;
+ }
+ if(g1[0] < 0)
+ g1[0] = 0;
+ subframes_count = 4;
+ }
+ // This interpolation is done to produce smoother background noise.
+ slope = 0.5*(qcelp_g12ga[g1[0]] - q->last_codebook_gain) / subframes_count;
+ for(i=1; i<=subframes_count; i++)
+ gain[i-1] = q->last_codebook_gain + slope * i;
+
+ q->last_codebook_gain = gain[i-2];
+ q->prev_g1[0] = q->prev_g1[1];
+ q->prev_g1[1] = g1[0];
+ }
+}
+
+/**
+ * If the received packet is Rate 1/4 a further sanity check is made of the
+ * codebook gain.
+ *
+ * @param cbgain the unpacked cbgain array
+ * @return -1 if the sanity check fails, 0 otherwise
+ *
+ * TIA/EIA/IS-733 2.4.8.7.3
+ */
+static int codebook_sanity_check_for_rate_quarter(const uint8_t *cbgain)
+{
+ int i, diff, prev_diff=0;
+ for(i=1; i<5; i++)
+ {
+ diff = cbgain[i] - cbgain[i-1];
+ if(FFABS(diff) > 10)
+ return -1;
+ else if(FFABS(diff - prev_diff) > 12)
+ return -1;
+ prev_diff = diff;
+ }
return 0;
}
@@ -89,72 +339,84 @@ static av_cold int qcelp_decode_init(AVCodecContext *avctx) {
* @param gain array holding the 4 pitch subframe gain values
* @param cdn_vector array for the generated scaled codebook vector
*/
-static void compute_svector(const QCELPContext *q,
- const float *gain,
- float *cdn_vector) {
+static void compute_svector(QCELPContext *q, const float *gain,
+ float *cdn_vector)
+{
int i, j, k;
uint16_t cbseed, cindex;
float *rnd, tmp_gain, fir_filter_value;
- switch (q->framerate) {
- case RATE_FULL:
- for (i = 0; i < 16; i++) {
- tmp_gain = gain[i] * QCELP_RATE_FULL_CODEBOOK_RATIO;
- cindex = -q->cindex[i];
- for (j = 0; j < 10; j++)
- *cdn_vector++ = tmp_gain * qcelp_rate_full_codebook[cindex++ & 127];
- }
+ switch(q->bitrate)
+ {
+ case RATE_FULL:
+ for(i=0; i<16; i++)
+ {
+ tmp_gain = gain[i] * QCELP_RATE_FULL_CODEBOOK_RATIO;
+ cindex = -q->frame.cindex[i];
+ for(j=0; j<10; j++)
+ *cdn_vector++ = tmp_gain * qcelp_rate_full_codebook[cindex++ & 127];
+ }
break;
- case RATE_HALF:
- for (i = 0; i < 4; i++) {
- tmp_gain = gain[i] * QCELP_RATE_HALF_CODEBOOK_RATIO;
- cindex = -q->cindex[i];
- for (j = 0; j < 40; j++)
+ case RATE_HALF:
+ for(i=0; i<4; i++)
+ {
+ tmp_gain = gain[i] * QCELP_RATE_HALF_CODEBOOK_RATIO;
+ cindex = -q->frame.cindex[i];
+ for (j = 0; j < 40; j++)
*cdn_vector++ = tmp_gain * qcelp_rate_half_codebook[cindex++ & 127];
- }
+ }
break;
- case RATE_QUARTER:
- cbseed = (0x0003 & q->lspv[4])<<14 |
- (0x003F & q->lspv[3])<< 8 |
- (0x0060 & q->lspv[2])<< 1 |
- (0x0007 & q->lspv[1])<< 3 |
- (0x0038 & q->lspv[0])>> 3 ;
- rnd = q->rnd_fir_filter_mem + 20;
- for (i = 0; i < 8; i++) {
- tmp_gain = gain[i] * (QCELP_SQRT1887 / 32768.0);
- for (k = 0; k < 20; k++) {
- cbseed = 521 * cbseed + 259;
- *rnd = (int16_t)cbseed;
-
- // FIR filter
- fir_filter_value = 0.0;
- for (j = 0; j < 10; j++)
- fir_filter_value += qcelp_rnd_fir_coefs[j ] * (rnd[-j ] + rnd[-20+j]);
- fir_filter_value += qcelp_rnd_fir_coefs[10] * rnd[-10];
-
- *cdn_vector++ = tmp_gain * fir_filter_value;
- rnd++;
+ case RATE_QUARTER:
+ cbseed = (0x0003 & q->frame.lspv[4])<<14 |
+ (0x003F & q->frame.lspv[3])<< 8 |
+ (0x0060 & q->frame.lspv[2])<< 1 |
+ (0x0007 & q->frame.lspv[1])<< 3 |
+ (0x0038 & q->frame.lspv[0])>> 3 ;
+ rnd = q->rnd_fir_filter_mem + 20;
+ for(i=0; i<8; i++)
+ {
+ tmp_gain = gain[i] * (QCELP_SQRT1887 / 32768.0);
+ for(k=0; k<20; k++)
+ {
+ cbseed = 521 * cbseed + 259;
+ *rnd = (int16_t)cbseed;
+
+ // FIR filter
+ fir_filter_value = 0.0;
+ for(j=0; j<10; j++)
+ fir_filter_value += qcelp_rnd_fir_coefs[j ]
+ * (rnd[-j ] + rnd[-20+j]);
+
+ fir_filter_value += qcelp_rnd_fir_coefs[10] * rnd[-10];
+ *cdn_vector++ = tmp_gain * fir_filter_value;
+ rnd++;
+ }
}
- }
- memcpy(q->rnd_fir_filter_mem, q->rnd_fir_filter_mem + 160, 20 * sizeof(float));
+ memcpy(q->rnd_fir_filter_mem, q->rnd_fir_filter_mem + 160, 20 * sizeof(float));
break;
- case RATE_OCTAVE:
- cbseed = q->first16bits;
- for (i = 0; i < 8; i++) {
- tmp_gain = gain[i] * (QCELP_SQRT1887 / 32768.0);
- for (j = 0; j < 20; j++) {
- cbseed = 521 * cbseed + 259;
- *cdn_vector++ = tmp_gain * (int16_t)cbseed;
+ case RATE_OCTAVE:
+ cbseed = q->first16bits;
+ for(i=0; i<8; i++)
+ {
+ tmp_gain = gain[i] * (QCELP_SQRT1887 / 32768.0);
+ for(j=0; j<20; j++)
+ {
+ cbseed = 521 * cbseed + 259;
+ *cdn_vector++ = tmp_gain * (int16_t)cbseed;
+ }
}
- }
break;
- case I_F_Q:
- cbseed = -44; // random codebook index
- for (i = 0; i < 4; i++) {
- tmp_gain = gain[i] * QCELP_RATE_FULL_CODEBOOK_RATIO;
- for (j = 0; j < 40; j++)
- *cdn_vector++ = tmp_gain * qcelp_rate_full_codebook[cbseed++ & 127];
- }
+ case I_F_Q:
+ cbseed = -44; // random codebook index
+ for(i=0; i<4; i++)
+ {
+ tmp_gain = gain[i] * QCELP_RATE_FULL_CODEBOOK_RATIO;
+ for(j=0; j<40; j++)
+ *cdn_vector++ = tmp_gain * qcelp_rate_full_codebook[cbseed++ & 127];
+ }
+ break;
+ case SILENCE:
+ memset(cdn_vector, 0, 160 * sizeof(float));
break;
}
}
@@ -172,19 +434,21 @@ static void compute_svector(const QCELPContext *q,
*
* TIA/EIA/IS-733 2.4.8.3-2/3/4/5, 2.4.8.6
*/
-static void apply_gain_ctrl(float *v_out,
- const float *v_ref,
- const float *v_in) {
+static void apply_gain_ctrl(float *v_out, const float *v_ref,
+ const float *v_in)
+{
int i, j, len;
float scalefactor;
- for (i = 0, j = 0; i < 4; i++) {
+ for(i=0, j=0; i<4; i++)
+ {
scalefactor = ff_dot_productf(v_in + j, v_in + j, 40);
- if (scalefactor)
- scalefactor = sqrt(ff_dot_productf(v_ref + j, v_ref + j, 40) / scalefactor);
+ if(scalefactor)
+ scalefactor = sqrt(ff_dot_productf(v_ref + j, v_ref + j, 40)
+ / scalefactor);
else
- av_log_missing_feature(NULL, "Zero energy for gain control", 1);
- for (len = j + 40; j < len; j++)
+ ff_log_missing_feature(NULL, "Zero energy for gain control", 1);
+ for(len=j+40; j<len; j++)
v_out[j] = scalefactor * v_in[j];
}
}
@@ -201,29 +465,33 @@ static void apply_gain_ctrl(float *v_out,
* @param lag per-subframe lag array, each element is
* - between 16 and 143 if its corresponding pfrac is 0,
* - between 16 and 139 otherwise
- * @param pfrac per-subframe boolean array, 1 if the lag is fractional, 0 otherwise
+ * @param pfrac per-subframe boolean array, 1 if the lag is fractional, 0
+ * otherwise
*
* @return filter output vector
*/
-static const float *do_pitchfilter(float memory[303],
- const float v_in[160],
- const float gain[4],
- const uint8_t *lag,
- const uint8_t pfrac[4]) {
+static const float *do_pitchfilter(float memory[303], const float v_in[160],
+ const float gain[4], const uint8_t *lag,
+ const uint8_t pfrac[4])
+{
int i, j;
float *v_lag, *v_out;
const float *v_len;
v_out = memory + 143; // Output vector starts at memory[143].
- for (i = 0; i < 4; i++)
- if (gain[i]) {
+ for(i=0; i<4; i++)
+ {
+ if(gain[i])
+ {
v_lag = memory + 143 + 40 * i - lag[i];
- for (v_len = v_in + 40; v_in < v_len; v_in++) {
- if (pfrac[i]) { // If it is a fractional lag...
- for (j = 0, *v_out = 0.; j < 4; j++)
+ for(v_len=v_in+40; v_in<v_len; v_in++)
+ {
+ if(pfrac[i]) // If it is a fractional lag...
+ {
+ for(j=0, *v_out=0.; j<4; j++)
*v_out += qcelp_hammsinc_table[j] * (v_lag[j-4] + v_lag[3-j]);
- } else
+ }else
*v_out = *v_lag;
*v_out = *v_in + gain[i] * *v_out;
@@ -231,68 +499,313 @@ static const float *do_pitchfilter(float memory[303],
v_lag++;
v_out++;
}
- } else {
+ }else
+ {
memcpy(v_out, v_in, 40 * sizeof(float));
v_in += 40;
v_out += 40;
}
+ }
memmove(memory, memory + 160, 143 * sizeof(float));
return memory + 143;
}
/**
+ * Apply pitch synthesis filter and pitch prefilter to the scaled codebook vector.
+ * TIA/EIA/IS-733 2.4.5.2, 2.4.8.7.2
+ *
+ * @param q the context
+ * @param cdn_vector the scaled codebook vector
+ */
+static void apply_pitch_filters(QCELPContext *q, float *cdn_vector)
+{
+ int i;
+ const float *v_synthesis_filtered, *v_pre_filtered;
+
+ if(q->bitrate >= RATE_HALF ||
+ q->bitrate == SILENCE ||
+ (q->bitrate == I_F_Q && (q->prev_bitrate >= RATE_HALF)))
+ {
+
+ if(q->bitrate >= RATE_HALF)
+ {
+
+ // Compute gain & lag for the whole frame.
+ for(i=0; i<4; i++)
+ {
+ q->pitch_gain[i] = q->frame.plag[i] ? (q->frame.pgain[i] + 1) * 0.25 : 0.0;
+
+ q->pitch_lag[i] = q->frame.plag[i] + 16;
+ }
+ }else
+ {
+ float max_pitch_gain;
+
+ if (q->bitrate == I_F_Q)
+ {
+ if (q->erasure_count < 3)
+ max_pitch_gain = 0.9 - 0.3 * (q->erasure_count - 1);
+ else
+ max_pitch_gain = 0.0;
+ }else
+ {
+ assert(q->bitrate == SILENCE);
+ max_pitch_gain = 1.0;
+ }
+ for(i=0; i<4; i++)
+ q->pitch_gain[i] = FFMIN(q->pitch_gain[i], max_pitch_gain);
+
+ memset(q->frame.pfrac, 0, sizeof(q->frame.pfrac));
+ }
+
+ // pitch synthesis filter
+ v_synthesis_filtered = do_pitchfilter(q->pitch_synthesis_filter_mem,
+ cdn_vector, q->pitch_gain,
+ q->pitch_lag, q->frame.pfrac);
+
+ // pitch prefilter update
+ for(i=0; i<4; i++)
+ q->pitch_gain[i] = 0.5 * FFMIN(q->pitch_gain[i], 1.0);
+
+ v_pre_filtered = do_pitchfilter(q->pitch_pre_filter_mem,
+ v_synthesis_filtered,
+ q->pitch_gain, q->pitch_lag,
+ q->frame.pfrac);
+
+ apply_gain_ctrl(cdn_vector, v_synthesis_filtered, v_pre_filtered);
+ }else
+ {
+ memcpy(q->pitch_synthesis_filter_mem, cdn_vector + 17,
+ 143 * sizeof(float));
+ memcpy(q->pitch_pre_filter_mem, cdn_vector + 17, 143 * sizeof(float));
+ memset(q->pitch_gain, 0, sizeof(q->pitch_gain));
+ memset(q->pitch_lag, 0, sizeof(q->pitch_lag));
+ }
+}
+
+/**
* Interpolates LSP frequencies and computes LPC coefficients
- * for a given framerate & pitch subframe.
+ * for a given bitrate & pitch subframe.
*
- * TIA/EIA/IS-733 2.4.3.3.4
+ * TIA/EIA/IS-733 2.4.3.3.4, 2.4.8.7.2
*
* @param q the context
* @param curr_lspf LSP frequencies vector of the current frame
* @param lpc float vector for the resulting LPC
* @param subframe_num frame number in decoded stream
*/
-void interpolate_lpc(QCELPContext *q,
- const float *curr_lspf,
- float *lpc,
- const int subframe_num) {
+void interpolate_lpc(QCELPContext *q, const float *curr_lspf, float *lpc,
+ const int subframe_num)
+{
float interpolated_lspf[10];
float weight;
- if (q->framerate >= RATE_QUARTER) {
+ if(q->bitrate >= RATE_QUARTER)
weight = 0.25 * (subframe_num + 1);
- } else if (q->framerate == RATE_OCTAVE && !subframe_num) {
+ else if(q->bitrate == RATE_OCTAVE && !subframe_num)
weight = 0.625;
- } else {
+ else
weight = 1.0;
+
+ if(weight != 1.0)
+ {
+ weighted_vector_sumf(interpolated_lspf, curr_lspf, q->prev_lspf,
+ weight, 1.0 - weight, 10);
+ ff_qcelp_lspf2lpc(interpolated_lspf, lpc);
+ }else if(q->bitrate >= RATE_QUARTER ||
+ (q->bitrate == I_F_Q && !subframe_num))
+ ff_qcelp_lspf2lpc(curr_lspf, lpc);
+ else if(q->bitrate == SILENCE && !subframe_num)
+ ff_qcelp_lspf2lpc(q->prev_lspf, lpc);
+}
+
+static qcelp_packet_rate buf_size2bitrate(const int buf_size)
+{
+ switch(buf_size)
+ {
+ case 35: return RATE_FULL;
+ case 17: return RATE_HALF;
+ case 8: return RATE_QUARTER;
+ case 4: return RATE_OCTAVE;
+ case 1: return SILENCE;
}
- if (weight != 1.0) {
- weighted_vector_sumf(interpolated_lspf, curr_lspf, q->prev_lspf, weight, 1.0 - weight, 10);
- qcelp_lspf2lpc(interpolated_lspf, lpc);
- } else if (q->framerate >= RATE_QUARTER || (q->framerate == I_F_Q && !subframe_num))
- qcelp_lspf2lpc(curr_lspf, lpc);
+ return I_F_Q;
}
-static int buf_size2framerate(const int buf_size) {
- switch (buf_size) {
- case 35:
- return RATE_FULL;
- case 17:
- return RATE_HALF;
- case 8:
- return RATE_QUARTER;
- case 4:
- return RATE_OCTAVE;
- case 1:
- return SILENCE;
+/**
+ * Determine the bitrate from the frame size and/or the first byte of the frame.
+ *
+ * @param avctx the AV codec context
+ * @param buf_size length of the buffer
+ * @param buf the buffer
+ *
+ * @return the bitrate on success,
+ * I_F_Q if the bitrate cannot be satisfactorily determined
+ *
+ * TIA/EIA/IS-733 2.4.8.7.1
+ */
+static qcelp_packet_rate determine_bitrate(AVCodecContext *avctx, const int buf_size,
+ const uint8_t **buf)
+{
+ qcelp_packet_rate bitrate;
+
+ if((bitrate = buf_size2bitrate(buf_size)) >= 0)
+ {
+ if(bitrate > **buf)
+ {
+ QCELPContext *q = avctx->priv_data;
+ if (!q->warned_buf_mismatch_bitrate)
+ {
+ av_log(avctx, AV_LOG_WARNING,
+ "Claimed bitrate and buffer size mismatch.\n");
+ q->warned_buf_mismatch_bitrate = 1;
+ }
+ bitrate = **buf;
+ }else if(bitrate < **buf)
+ {
+ av_log(avctx, AV_LOG_ERROR,
+ "Buffer is too small for the claimed bitrate.\n");
+ return I_F_Q;
+ }
+ (*buf)++;
+ }else if((bitrate = buf_size2bitrate(buf_size + 1)) >= 0)
+ {
+ av_log(avctx, AV_LOG_WARNING,
+ "Bitrate byte is missing, guessing the bitrate from packet size.\n");
+ }else
+ return I_F_Q;
+
+ if(bitrate == SILENCE)
+ {
+ //FIXME: Remove experimental warning when tested with samples.
+ av_log(avctx, AV_LOG_WARNING, "'Blank frame handling is experimental."
+ " If you want to help, upload a sample "
+ "of this file to ftp://upload.ffmpeg.org/MPlayer/incoming/ "
+ "and contact the ffmpeg-devel mailing list.\n");
}
- return -1;
+ return bitrate;
}
static void warn_insufficient_frame_quality(AVCodecContext *avctx,
- const char *message) {
- av_log(avctx, AV_LOG_WARNING, "Frame #%d, IFQ: %s\n", avctx->frame_number, message);
+ const char *message)
+{
+ av_log(avctx, AV_LOG_WARNING, "Frame #%d, IFQ: %s\n", avctx->frame_number,
+ message);
+}
+
+static int qcelp_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
+ const uint8_t *buf, int buf_size)
+{
+ QCELPContext *q = avctx->priv_data;
+ float *outbuffer = data;
+ int i;
+ float quantized_lspf[10], lpc[10];
+ float gain[16];
+ float *formant_mem;
+
+ if((q->bitrate = determine_bitrate(avctx, buf_size, &buf)) == I_F_Q)
+ {
+ warn_insufficient_frame_quality(avctx, "bitrate cannot be determined.");
+ goto erasure;
+ }
+
+ if(q->bitrate == RATE_OCTAVE &&
+ (q->first16bits = AV_RB16(buf)) == 0xFFFF)
+ {
+ warn_insufficient_frame_quality(avctx, "Bitrate is 1/8 and first 16 bits are on.");
+ goto erasure;
+ }
+
+ if(q->bitrate > SILENCE)
+ {
+ const QCELPBitmap *bitmaps = qcelp_unpacking_bitmaps_per_rate[q->bitrate];
+ const QCELPBitmap *bitmaps_end = qcelp_unpacking_bitmaps_per_rate[q->bitrate]
+ + qcelp_unpacking_bitmaps_lengths[q->bitrate];
+ uint8_t *unpacked_data = (uint8_t *)&q->frame;
+
+ init_get_bits(&q->gb, buf, 8*buf_size);
+
+ memset(&q->frame, 0, sizeof(QCELPFrame));
+
+ for(; bitmaps < bitmaps_end; bitmaps++)
+ unpacked_data[bitmaps->index] |= get_bits(&q->gb, bitmaps->bitlen) << bitmaps->bitpos;
+
+ // Check for erasures/blanks on rates 1, 1/4 and 1/8.
+ if(q->frame.reserved)
+ {
+ warn_insufficient_frame_quality(avctx, "Wrong data in reserved frame area.");
+ goto erasure;
+ }
+ if(q->bitrate == RATE_QUARTER &&
+ codebook_sanity_check_for_rate_quarter(q->frame.cbgain))
+ {
+ warn_insufficient_frame_quality(avctx, "Codebook gain sanity check failed.");
+ goto erasure;
+ }
+
+ if(q->bitrate >= RATE_HALF)
+ {
+ for(i=0; i<4; i++)
+ {
+ if(q->frame.pfrac[i] && q->frame.plag[i] >= 124)
+ {
+ warn_insufficient_frame_quality(avctx, "Cannot initialize pitch filter.");
+ goto erasure;
+ }
+ }
+ }
+ }
+
+ decode_gain_and_index(q, gain);
+ compute_svector(q, gain, outbuffer);
+
+ if(decode_lspf(q, quantized_lspf) < 0)
+ {
+ warn_insufficient_frame_quality(avctx, "Badly received packets in frame.");
+ goto erasure;
+ }
+
+
+ apply_pitch_filters(q, outbuffer);
+
+ if(q->bitrate == I_F_Q)
+ {
+erasure:
+ q->bitrate = I_F_Q;
+ q->erasure_count++;
+ decode_gain_and_index(q, gain);
+ compute_svector(q, gain, outbuffer);
+ decode_lspf(q, quantized_lspf);
+ apply_pitch_filters(q, outbuffer);
+ }else
+ q->erasure_count = 0;
+
+ formant_mem = q->formant_mem + 10;
+ for(i=0; i<4; i++)
+ {
+ interpolate_lpc(q, quantized_lspf, lpc, i);
+ ff_celp_lp_synthesis_filterf(formant_mem, lpc, outbuffer + i * 40, 40,
+ 10);
+ formant_mem += 40;
+ }
+ memcpy(q->formant_mem, q->formant_mem + 160, 10 * sizeof(float));
+
+ // FIXME: postfilter and final gain control should be here.
+ // TIA/EIA/IS-733 2.4.8.6
+
+ formant_mem = q->formant_mem + 10;
+ for(i=0; i<160; i++)
+ *outbuffer++ = av_clipf(*formant_mem++, QCELP_CLIP_LOWER_BOUND,
+ QCELP_CLIP_UPPER_BOUND);
+
+ memcpy(q->prev_lspf, quantized_lspf, sizeof(q->prev_lspf));
+ q->prev_bitrate = q->bitrate;
+
+ *data_size = 160 * sizeof(*outbuffer);
+
+ return *data_size;
}
AVCodec qcelp_decoder =
diff --git a/libavcodec/ratecontrol.c b/libavcodec/ratecontrol.c
index 8e248b9..c88059a 100644
--- a/libavcodec/ratecontrol.c
+++ b/libavcodec/ratecontrol.c
@@ -461,7 +461,7 @@ static double modify_qscale(MpegEncContext *s, RateControlEntry *rce, double q,
else if(d<0.0001) d=0.0001;
q*= pow(d, 1.0/s->avctx->rc_buffer_aggressivity);
- q_limit= bits2qp(rce, FFMAX((min_rate - buffer_size + rcc->buffer_index)*3, 1));
+ q_limit= bits2qp(rce, FFMAX((min_rate - buffer_size + rcc->buffer_index) * s->avctx->rc_min_vbv_overflow_use, 1));
if(q > q_limit){
if(s->avctx->debug&FF_DEBUG_RC){
av_log(s->avctx, AV_LOG_DEBUG, "limiting QP %f -> %f\n", q, q_limit);
@@ -476,7 +476,7 @@ static double modify_qscale(MpegEncContext *s, RateControlEntry *rce, double q,
else if(d<0.0001) d=0.0001;
q/= pow(d, 1.0/s->avctx->rc_buffer_aggressivity);
- q_limit= bits2qp(rce, FFMAX(rcc->buffer_index/3, 1));
+ q_limit= bits2qp(rce, FFMAX(rcc->buffer_index * s->avctx->rc_max_available_vbv_use, 1));
if(q < q_limit){
if(s->avctx->debug&FF_DEBUG_RC){
av_log(s->avctx, AV_LOG_DEBUG, "limiting QP %f -> %f\n", q, q_limit);
diff --git a/libavcodec/raw.c b/libavcodec/raw.c
index c88830c..de10f28 100644
--- a/libavcodec/raw.c
+++ b/libavcodec/raw.c
@@ -51,6 +51,7 @@ const PixelFormatTag ff_raw_pixelFormatTags[] = {
/* quicktime */
{ PIX_FMT_UYVY422, MKTAG('2', 'v', 'u', 'y') },
{ PIX_FMT_UYVY422, MKTAG('A', 'V', 'U', 'I') }, /* FIXME merge both fields */
+ { PIX_FMT_PAL8, MKTAG('W', 'R', 'A', 'W') },
{ PIX_FMT_NONE, 0 },
};
diff --git a/libavcodec/roqaudioenc.c b/libavcodec/roqaudioenc.c
index df014a4..28a6adc 100644
--- a/libavcodec/roqaudioenc.c
+++ b/libavcodec/roqaudioenc.c
@@ -35,7 +35,7 @@ static unsigned char dpcmValues[MAX_DPCM];
typedef struct
{
short lastSample[2];
-} ROQDPCMContext_t;
+} ROQDPCMContext;
static av_cold void roq_dpcm_table_init(void)
{
@@ -51,7 +51,7 @@ static av_cold void roq_dpcm_table_init(void)
static int roq_dpcm_encode_init(AVCodecContext *avctx)
{
- ROQDPCMContext_t *context = avctx->priv_data;
+ ROQDPCMContext *context = avctx->priv_data;
if (avctx->channels > 2) {
av_log(avctx, AV_LOG_ERROR, "Audio must be mono or stereo\n");
@@ -123,7 +123,7 @@ static int roq_dpcm_encode_frame(AVCodecContext *avctx,
short *in;
unsigned char *out;
- ROQDPCMContext_t *context = avctx->priv_data;
+ ROQDPCMContext *context = avctx->priv_data;
stereo = (avctx->channels == 2);
@@ -169,7 +169,7 @@ AVCodec roq_dpcm_encoder = {
"roq_dpcm",
CODEC_TYPE_AUDIO,
CODEC_ID_ROQ_DPCM,
- sizeof(ROQDPCMContext_t),
+ sizeof(ROQDPCMContext),
roq_dpcm_encode_init,
roq_dpcm_encode_frame,
roq_dpcm_encode_close,
diff --git a/libavcodec/roqvideoenc.c b/libavcodec/roqvideoenc.c
index a115e58..3b405cc 100644
--- a/libavcodec/roqvideoenc.c
+++ b/libavcodec/roqvideoenc.c
@@ -190,20 +190,20 @@ typedef struct
int subCels[4];
motion_vect motion;
int cbEntry;
-} subcel_evaluation_t;
+} SubcelEvaluation;
typedef struct
{
int eval_dist[4];
int best_coding;
- subcel_evaluation_t subCels[4];
+ SubcelEvaluation subCels[4];
motion_vect motion;
int cbEntry;
int sourceX, sourceY;
-} cel_evaluation_t;
+} CelEvaluation;
typedef struct
{
@@ -214,14 +214,14 @@ typedef struct
uint8_t unpacked_cb2[MAX_CBS_2x2*2*2*3];
uint8_t unpacked_cb4[MAX_CBS_4x4*4*4*3];
uint8_t unpacked_cb4_enlarged[MAX_CBS_4x4*8*8*3];
-} roq_codebooks_t;
+} RoqCodebooks;
/**
* Temporary vars
*/
typedef struct
{
- cel_evaluation_t *cel_evals;
+ CelEvaluation *cel_evals;
int f2i4[MAX_CBS_4x4];
int i2f4[MAX_CBS_4x4];
@@ -233,20 +233,20 @@ typedef struct
int numCB4;
int numCB2;
- roq_codebooks_t codebooks;
+ RoqCodebooks codebooks;
int *closest_cb2;
int used_option[4];
-} roq_tempdata_t;
+} RoqTempdata;
/**
* Initializes cel evaluators and sets their source coordinates
*/
-static void create_cel_evals(RoqContext *enc, roq_tempdata_t *tempData)
+static void create_cel_evals(RoqContext *enc, RoqTempdata *tempData)
{
int n=0, x, y, i;
- tempData->cel_evals = av_malloc(enc->width*enc->height/64 * sizeof(cel_evaluation_t));
+ tempData->cel_evals = av_malloc(enc->width*enc->height/64 * sizeof(CelEvaluation));
/* Map to the ROQ quadtree order */
for (y=0; y<enc->height; y+=16)
@@ -395,8 +395,8 @@ static void motion_search(RoqContext *enc, int blocksize)
/**
* Gets distortion for all options available to a subcel
*/
-static void gather_data_for_subcel(subcel_evaluation_t *subcel, int x,
- int y, RoqContext *enc, roq_tempdata_t *tempData)
+static void gather_data_for_subcel(SubcelEvaluation *subcel, int x,
+ int y, RoqContext *enc, RoqTempdata *tempData)
{
uint8_t mb4[4*4*3];
uint8_t mb2[2*2*3];
@@ -459,8 +459,8 @@ static void gather_data_for_subcel(subcel_evaluation_t *subcel, int x,
/**
* Gets distortion for all options available to a cel
*/
-static void gather_data_for_cel(cel_evaluation_t *cel, RoqContext *enc,
- roq_tempdata_t *tempData)
+static void gather_data_for_cel(CelEvaluation *cel, RoqContext *enc,
+ RoqTempdata *tempData)
{
uint8_t mb8[8*8*3];
int index = cel->sourceY*enc->width/64 + cel->sourceX/8;
@@ -533,7 +533,7 @@ static void gather_data_for_cel(cel_evaluation_t *cel, RoqContext *enc,
}
}
-static void remap_codebooks(RoqContext *enc, roq_tempdata_t *tempData)
+static void remap_codebooks(RoqContext *enc, RoqTempdata *tempData)
{
int i, j, idx=0;
@@ -565,7 +565,7 @@ static void remap_codebooks(RoqContext *enc, roq_tempdata_t *tempData)
/**
* Write codebook chunk
*/
-static void write_codebooks(RoqContext *enc, roq_tempdata_t *tempData)
+static void write_codebooks(RoqContext *enc, RoqTempdata *tempData)
{
int i, j;
uint8_t **outp= &enc->out_buf;
@@ -620,7 +620,7 @@ static void write_typecode(CodingSpool *s, uint8_t type)
}
}
-static void reconstruct_and_encode_image(RoqContext *enc, roq_tempdata_t *tempData, int w, int h, int numBlocks)
+static void reconstruct_and_encode_image(RoqContext *enc, RoqTempdata *tempData, int w, int h, int numBlocks)
{
int i, j, k;
int x, y;
@@ -628,7 +628,7 @@ static void reconstruct_and_encode_image(RoqContext *enc, roq_tempdata_t *tempDa
int dist=0;
roq_qcell *qcell;
- cel_evaluation_t *eval;
+ CelEvaluation *eval;
CodingSpool spool;
@@ -789,7 +789,7 @@ static void create_clusters(AVFrame *frame, int w, int h, uint8_t *yuvClusters)
}
}
-static void generate_codebook(RoqContext *enc, roq_tempdata_t *tempdata,
+static void generate_codebook(RoqContext *enc, RoqTempdata *tempdata,
int *points, int inputCount, roq_cell *results,
int size, int cbsize)
{
@@ -824,10 +824,10 @@ static void generate_codebook(RoqContext *enc, roq_tempdata_t *tempdata,
av_free(codebook);
}
-static void generate_new_codebooks(RoqContext *enc, roq_tempdata_t *tempData)
+static void generate_new_codebooks(RoqContext *enc, RoqTempdata *tempData)
{
int i,j;
- roq_codebooks_t *codebooks = &tempData->codebooks;
+ RoqCodebooks *codebooks = &tempData->codebooks;
int max = enc->width*enc->height/16;
uint8_t mb2[3*4];
roq_cell *results4 = av_malloc(sizeof(roq_cell)*MAX_CBS_4x4*4);
@@ -880,7 +880,7 @@ static void generate_new_codebooks(RoqContext *enc, roq_tempdata_t *tempData)
static void roq_encode_video(RoqContext *enc)
{
- roq_tempdata_t tempData;
+ RoqTempdata tempData;
int i;
memset(&tempData, 0, sizeof(tempData));
diff --git a/libavcodec/rv10.c b/libavcodec/rv10.c
index 0c05147..ad09c07 100644
--- a/libavcodec/rv10.c
+++ b/libavcodec/rv10.c
@@ -527,6 +527,11 @@ static av_cold int rv10_decode_init(AVCodecContext *avctx)
MpegEncContext *s = avctx->priv_data;
static int done=0;
+ if (avctx->extradata_size < 8) {
+ av_log(avctx, AV_LOG_ERROR, "Extradata is too small.\n");
+ return -1;
+ }
+
MPV_decode_defaults(s);
s->avctx= avctx;
diff --git a/libavcodec/rv30.c b/libavcodec/rv30.c
index 0202873..97d87b0 100644
--- a/libavcodec/rv30.c
+++ b/libavcodec/rv30.c
@@ -112,6 +112,127 @@ static int rv30_decode_mb_info(RV34DecContext *r)
return rv30_b_types[code];
}
+static inline void rv30_weak_loop_filter(uint8_t *src, const int step,
+ const int stride, const int lim)
+{
+ uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+ int i, diff;
+
+ for(i = 0; i < 4; i++){
+ diff = ((src[-2*step] - src[1*step]) - (src[-1*step] - src[0*step])*4) >> 3;
+ diff = av_clip(diff, -lim, lim);
+ src[-1*step] = cm[src[-1*step] + diff];
+ src[ 0*step] = cm[src[ 0*step] - diff];
+ src += stride;
+ }
+}
+
+static void rv30_loop_filter(RV34DecContext *r, int row)
+{
+ MpegEncContext *s = &r->s;
+ int mb_pos, mb_x;
+ int i, j, k;
+ uint8_t *Y, *C;
+ int loc_lim, cur_lim, left_lim = 0, top_lim = 0;
+
+ mb_pos = row * s->mb_stride;
+ for(mb_x = 0; mb_x < s->mb_width; mb_x++, mb_pos++){
+ int mbtype = s->current_picture_ptr->mb_type[mb_pos];
+ if(IS_INTRA(mbtype) || IS_SEPARATE_DC(mbtype))
+ r->deblock_coefs[mb_pos] = 0xFFFF;
+ if(IS_INTRA(mbtype))
+ r->cbp_chroma[mb_pos] = 0xFF;
+ }
+
+ /* all vertical edges are filtered first
+ * and horizontal edges are filtered on the next iteration
+ */
+ mb_pos = row * s->mb_stride;
+ for(mb_x = 0; mb_x < s->mb_width; mb_x++, mb_pos++){
+ cur_lim = rv30_loop_filt_lim[s->current_picture_ptr->qscale_table[mb_pos]];
+ if(mb_x)
+ left_lim = rv30_loop_filt_lim[s->current_picture_ptr->qscale_table[mb_pos - 1]];
+ for(j = 0; j < 16; j += 4){
+ Y = s->current_picture_ptr->data[0] + mb_x*16 + (row*16 + j) * s->linesize + 4 * !mb_x;
+ for(i = !mb_x; i < 4; i++, Y += 4){
+ int ij = i + j;
+ loc_lim = 0;
+ if(r->deblock_coefs[mb_pos] & (1 << ij))
+ loc_lim = cur_lim;
+ else if(!i && r->deblock_coefs[mb_pos - 1] & (1 << (ij + 3)))
+ loc_lim = left_lim;
+ else if( i && r->deblock_coefs[mb_pos] & (1 << (ij - 1)))
+ loc_lim = cur_lim;
+ if(loc_lim)
+ rv30_weak_loop_filter(Y, 1, s->linesize, loc_lim);
+ }
+ }
+ for(k = 0; k < 2; k++){
+ int cur_cbp, left_cbp = 0;
+ cur_cbp = (r->cbp_chroma[mb_pos] >> (k*4)) & 0xF;
+ if(mb_x)
+ left_cbp = (r->cbp_chroma[mb_pos - 1] >> (k*4)) & 0xF;
+ for(j = 0; j < 8; j += 4){
+ C = s->current_picture_ptr->data[k+1] + mb_x*8 + (row*8 + j) * s->uvlinesize + 4 * !mb_x;
+ for(i = !mb_x; i < 2; i++, C += 4){
+ int ij = i + (j >> 1);
+ loc_lim = 0;
+ if(cur_cbp && (1 << ij))
+ loc_lim = cur_lim;
+ else if(!i && left_cbp & (1 << (ij + 1)))
+ loc_lim = left_lim;
+ else if( i && cur_cbp & (1 << (ij - 1)))
+ loc_lim = cur_lim;
+ if(loc_lim)
+ rv30_weak_loop_filter(C, 1, s->uvlinesize, loc_lim);
+ }
+ }
+ }
+ }
+ mb_pos = row * s->mb_stride;
+ for(mb_x = 0; mb_x < s->mb_width; mb_x++, mb_pos++){
+ cur_lim = rv30_loop_filt_lim[s->current_picture_ptr->qscale_table[mb_pos]];
+ if(row)
+ top_lim = rv30_loop_filt_lim[s->current_picture_ptr->qscale_table[mb_pos - s->mb_stride]];
+ for(j = 4*!row; j < 16; j += 4){
+ Y = s->current_picture_ptr->data[0] + mb_x*16 + (row*16 + j) * s->linesize;
+ for(i = 0; i < 4; i++, Y += 4){
+ int ij = i + j;
+ loc_lim = 0;
+ if(r->deblock_coefs[mb_pos] & (1 << ij))
+ loc_lim = cur_lim;
+ else if(!j && r->deblock_coefs[mb_pos - s->mb_stride] & (1 << (ij + 12)))
+ loc_lim = top_lim;
+ else if( j && r->deblock_coefs[mb_pos] & (1 << (ij - 4)))
+ loc_lim = cur_lim;
+ if(loc_lim)
+ rv30_weak_loop_filter(Y, s->linesize, 1, loc_lim);
+ }
+ }
+ for(k = 0; k < 2; k++){
+ int cur_cbp, top_cbp = 0;
+ cur_cbp = (r->cbp_chroma[mb_pos] >> (k*4)) & 0xF;
+ if(row)
+ top_cbp = (r->cbp_chroma[mb_pos - s->mb_stride] >> (k*4)) & 0xF;
+ for(j = 4*!row; j < 8; j += 4){
+ C = s->current_picture_ptr->data[k+1] + mb_x*8 + (row*8 + j) * s->uvlinesize;
+ for(i = 0; i < 2; i++, C += 4){
+ int ij = i + (j >> 1);
+ loc_lim = 0;
+ if(r->cbp_chroma[mb_pos] && (1 << ij))
+ loc_lim = cur_lim;
+ else if(!j && top_cbp & (1 << (ij + 2)))
+ loc_lim = top_lim;
+ else if( j && cur_cbp & (1 << (ij - 2)))
+ loc_lim = cur_lim;
+ if(loc_lim)
+ rv30_weak_loop_filter(C, s->uvlinesize, 1, loc_lim);
+ }
+ }
+ }
+ }
+}
+
/**
* Initialize decoder.
*/
@@ -130,6 +251,7 @@ static av_cold int rv30_decode_init(AVCodecContext *avctx)
r->parse_slice_header = rv30_parse_slice_header;
r->decode_intra_types = rv30_decode_intra_types;
r->decode_mb_info = rv30_decode_mb_info;
+ r->loop_filter = rv30_loop_filter;
r->luma_dc_quant_i = rv30_luma_dc_quant;
r->luma_dc_quant_p = rv30_luma_dc_quant;
return 0;
diff --git a/libavcodec/rv30data.h b/libavcodec/rv30data.h
index c16e51b..1662fd7 100644
--- a/libavcodec/rv30data.h
+++ b/libavcodec/rv30data.h
@@ -171,4 +171,11 @@ static const uint8_t rv30_itype_from_context[900] = {
2, 7, 8, 4, 0, 6, 1, 5, 3,
2, 8, 3, 0, 7, 4, 1, 6, 5,
};
+
+/**
+ * Loop filter limits are taken from this table.
+ */
+static const uint8_t rv30_loop_filt_lim[32] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5
+};
#endif /* AVCODEC_RV30DATA_H */
diff --git a/libavcodec/rv30dsp.c b/libavcodec/rv30dsp.c
index 13b218b..f693a14 100644
--- a/libavcodec/rv30dsp.c
+++ b/libavcodec/rv30dsp.c
@@ -34,14 +34,14 @@ static av_unused void OPNAME ## rv30_tpel8_h_lowpass(uint8_t *dst, uint8_t *src,
int i;\
for(i=0; i<h; i++)\
{\
- OP(dst[0], -(src[-1]+src[2]) + src[0]*C1 + src[1]*C2);\
- OP(dst[1], -(src[ 0]+src[3]) + src[1]*C1 + src[2]*C2);\
- OP(dst[2], -(src[ 1]+src[4]) + src[2]*C1 + src[3]*C2);\
- OP(dst[3], -(src[ 2]+src[5]) + src[3]*C1 + src[4]*C2);\
- OP(dst[4], -(src[ 3]+src[6]) + src[4]*C1 + src[5]*C2);\
- OP(dst[5], -(src[ 4]+src[7]) + src[5]*C1 + src[6]*C2);\
- OP(dst[6], -(src[ 5]+src[8]) + src[6]*C1 + src[7]*C2);\
- OP(dst[7], -(src[ 6]+src[9]) + src[7]*C1 + src[8]*C2);\
+ OP(dst[0], (-(src[-1]+src[2]) + src[0]*C1 + src[1]*C2 + 8)>>4);\
+ OP(dst[1], (-(src[ 0]+src[3]) + src[1]*C1 + src[2]*C2 + 8)>>4);\
+ OP(dst[2], (-(src[ 1]+src[4]) + src[2]*C1 + src[3]*C2 + 8)>>4);\
+ OP(dst[3], (-(src[ 2]+src[5]) + src[3]*C1 + src[4]*C2 + 8)>>4);\
+ OP(dst[4], (-(src[ 3]+src[6]) + src[4]*C1 + src[5]*C2 + 8)>>4);\
+ OP(dst[5], (-(src[ 4]+src[7]) + src[5]*C1 + src[6]*C2 + 8)>>4);\
+ OP(dst[6], (-(src[ 5]+src[8]) + src[6]*C1 + src[7]*C2 + 8)>>4);\
+ OP(dst[7], (-(src[ 6]+src[9]) + src[7]*C1 + src[8]*C2 + 8)>>4);\
dst+=dstStride;\
src+=srcStride;\
}\
@@ -64,71 +64,92 @@ static void OPNAME ## rv30_tpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstSt
const int src7= src[7 *srcStride];\
const int src8= src[8 *srcStride];\
const int src9= src[9 *srcStride];\
- OP(dst[0*dstStride], -(srcA+src2) + src0*C1 + src1*C2);\
- OP(dst[1*dstStride], -(src0+src3) + src1*C1 + src2*C2);\
- OP(dst[2*dstStride], -(src1+src4) + src2*C1 + src3*C2);\
- OP(dst[3*dstStride], -(src2+src5) + src3*C1 + src4*C2);\
- OP(dst[4*dstStride], -(src3+src6) + src4*C1 + src5*C2);\
- OP(dst[5*dstStride], -(src4+src7) + src5*C1 + src6*C2);\
- OP(dst[6*dstStride], -(src5+src8) + src6*C1 + src7*C2);\
- OP(dst[7*dstStride], -(src6+src9) + src7*C1 + src8*C2);\
+ OP(dst[0*dstStride], (-(srcA+src2) + src0*C1 + src1*C2 + 8)>>4);\
+ OP(dst[1*dstStride], (-(src0+src3) + src1*C1 + src2*C2 + 8)>>4);\
+ OP(dst[2*dstStride], (-(src1+src4) + src2*C1 + src3*C2 + 8)>>4);\
+ OP(dst[3*dstStride], (-(src2+src5) + src3*C1 + src4*C2 + 8)>>4);\
+ OP(dst[4*dstStride], (-(src3+src6) + src4*C1 + src5*C2 + 8)>>4);\
+ OP(dst[5*dstStride], (-(src4+src7) + src5*C1 + src6*C2 + 8)>>4);\
+ OP(dst[6*dstStride], (-(src5+src8) + src6*C1 + src7*C2 + 8)>>4);\
+ OP(dst[7*dstStride], (-(src6+src9) + src7*C1 + src8*C2 + 8)>>4);\
dst++;\
src++;\
}\
}\
\
-static void OPNAME ## rv30_tpel8_h3_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- const int h=8+2;\
+static void OPNAME ## rv30_tpel8_hv_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ const int w = 8;\
+ const int h = 8;\
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
- int i;\
- for(i=0; i<h; i++)\
- {\
- OP(dst[0], 6*src[0]+9*src[1]+src[2]);\
- OP(dst[1], 6*src[1]+9*src[2]+src[3]);\
- OP(dst[2], 6*src[2]+9*src[3]+src[4]);\
- OP(dst[3], 6*src[3]+9*src[4]+src[5]);\
- OP(dst[4], 6*src[4]+9*src[5]+src[6]);\
- OP(dst[5], 6*src[5]+9*src[6]+src[7]);\
- OP(dst[6], 6*src[6]+9*src[7]+src[8]);\
- OP(dst[7], 6*src[7]+9*src[8]+src[9]);\
- dst+=dstStride;\
- src+=srcStride;\
+ int i, j;\
+ for(j = 0; j < h; j++){\
+ for(i = 0; i < w; i++){\
+ OP(dst[i], (\
+ src[srcStride*-1+i-1] -12*src[srcStride*-1+i] -6*src[srcStride*-1+i+1] +src[srcStride*-1+i+2]+\
+ -12*src[srcStride* 0+i-1] +144*src[srcStride* 0+i] +72*src[srcStride* 0+i+1] -12*src[srcStride* 0+i+2] +\
+ -6*src[srcStride* 1+i-1] +72*src[srcStride* 1+i] +36*src[srcStride* 1+i+1] -6*src[srcStride* 1+i+2] +\
+ src[srcStride* 2+i-1] -12*src[srcStride* 2+i] -6*src[srcStride* 2+i+1] +src[srcStride* 2+i+2] +\
+ 128)>>8);\
+ }\
+ src += srcStride;\
+ dst += dstStride;\
}\
}\
\
-static void OPNAME ## rv30_tpel8_v3_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- const int w=8;\
+static void OPNAME ## rv30_tpel8_hhv_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ const int w = 8;\
+ const int h = 8;\
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
- int i;\
- for(i=0; i<w; i++)\
- {\
- const int src0= src[0 *srcStride];\
- const int src1= src[1 *srcStride];\
- const int src2= src[2 *srcStride];\
- const int src3= src[3 *srcStride];\
- const int src4= src[4 *srcStride];\
- const int src5= src[5 *srcStride];\
- const int src6= src[6 *srcStride];\
- const int src7= src[7 *srcStride];\
- const int src8= src[8 *srcStride];\
- const int src9= src[9 *srcStride];\
- OP(dst[0*dstStride], 6*src0 + 9*src1 + src2);\
- OP(dst[1*dstStride], 6*src1 + 9*src2 + src3);\
- OP(dst[2*dstStride], 6*src2 + 9*src3 + src4);\
- OP(dst[3*dstStride], 6*src3 + 9*src4 + src5);\
- OP(dst[4*dstStride], 6*src4 + 9*src5 + src6);\
- OP(dst[5*dstStride], 6*src5 + 9*src6 + src7);\
- OP(dst[6*dstStride], 6*src6 + 9*src7 + src8);\
- OP(dst[7*dstStride], 6*src7 + 9*src8 + src9);\
- dst ++;\
- src ++;\
+ int i, j;\
+ for(j = 0; j < h; j++){\
+ for(i = 0; i < w; i++){\
+ OP(dst[i], (\
+ src[srcStride*-1+i-1] -12*src[srcStride*-1+i+1] -6*src[srcStride*-1+i] +src[srcStride*-1+i+2]+\
+ -12*src[srcStride* 0+i-1] +144*src[srcStride* 0+i+1] +72*src[srcStride* 0+i] -12*src[srcStride* 0+i+2]+\
+ -6*src[srcStride* 1+i-1] +72*src[srcStride* 1+i+1] +36*src[srcStride* 1+i] -6*src[srcStride* 1+i+2]+\
+ src[srcStride* 2+i-1] -12*src[srcStride* 2+i+1] -6*src[srcStride* 2+i] +src[srcStride* 2+i+2]+\
+ 128)>>8);\
+ }\
+ src += srcStride;\
+ dst += dstStride;\
}\
}\
\
-static void OPNAME ## rv30_tpel8_hv_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- uint8_t half[8*10];\
- put_rv30_tpel8_h3_lowpass(half, src, 8, srcStride);\
- OPNAME ## rv30_tpel8_v3_lowpass(dst, half, dstStride, 8);\
+static void OPNAME ## rv30_tpel8_hvv_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ const int w = 8;\
+ const int h = 8;\
+ uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
+ int i, j;\
+ for(j = 0; j < h; j++){\
+ for(i = 0; i < w; i++){\
+ OP(dst[i], (\
+ src[srcStride*-1+i-1] -12*src[srcStride*-1+i] -6*src[srcStride*-1+i+1] +src[srcStride*-1+i+2]+\
+ -6*src[srcStride* 0+i-1] +72*src[srcStride* 0+i] +36*src[srcStride* 0+i+1] -6*src[srcStride* 0+i+2]+\
+ -12*src[srcStride* 1+i-1] +144*src[srcStride* 1+i] +72*src[srcStride* 1+i+1] -12*src[srcStride* 1+i+2]+\
+ src[srcStride* 2+i-1] -12*src[srcStride* 2+i] -6*src[srcStride* 2+i+1] +src[srcStride* 2+i+2]+\
+ 128)>>8);\
+ }\
+ src += srcStride;\
+ dst += dstStride;\
+ }\
+}\
+\
+static void OPNAME ## rv30_tpel8_hhvv_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ const int w = 8;\
+ const int h = 8;\
+ uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
+ int i, j;\
+ for(j = 0; j < h; j++){\
+ for(i = 0; i < w; i++){\
+ OP(dst[i], (\
+ 36*src[i+srcStride*0] +54*src[i+1+srcStride*0] +6*src[i+2+srcStride*0]+\
+ 54*src[i+srcStride*1] +81*src[i+1+srcStride*1] +9*src[i+2+srcStride*1]+\
+ 6*src[i+srcStride*2] + 9*src[i+1+srcStride*2] + src[i+2+srcStride*2]+\
+ 128)>>8);\
+ }\
+ src += srcStride;\
+ dst += dstStride;\
+ }\
}\
\
static void OPNAME ## rv30_tpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, const int C1, const int C2){\
@@ -158,6 +179,33 @@ static void OPNAME ## rv30_tpel16_hv_lowpass(uint8_t *dst, uint8_t *src, int dst
OPNAME ## rv30_tpel8_hv_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
+static void OPNAME ## rv30_tpel16_hhv_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## rv30_tpel8_hhv_lowpass(dst , src , dstStride, srcStride);\
+ OPNAME ## rv30_tpel8_hhv_lowpass(dst+8, src+8, dstStride, srcStride);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ OPNAME ## rv30_tpel8_hhv_lowpass(dst , src , dstStride, srcStride);\
+ OPNAME ## rv30_tpel8_hhv_lowpass(dst+8, src+8, dstStride, srcStride);\
+}\
+\
+static void OPNAME ## rv30_tpel16_hvv_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## rv30_tpel8_hvv_lowpass(dst , src , dstStride, srcStride);\
+ OPNAME ## rv30_tpel8_hvv_lowpass(dst+8, src+8, dstStride, srcStride);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ OPNAME ## rv30_tpel8_hvv_lowpass(dst , src , dstStride, srcStride);\
+ OPNAME ## rv30_tpel8_hvv_lowpass(dst+8, src+8, dstStride, srcStride);\
+}\
+\
+static void OPNAME ## rv30_tpel16_hhvv_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## rv30_tpel8_hhvv_lowpass(dst , src , dstStride, srcStride);\
+ OPNAME ## rv30_tpel8_hhvv_lowpass(dst+8, src+8, dstStride, srcStride);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ OPNAME ## rv30_tpel8_hhvv_lowpass(dst , src , dstStride, srcStride);\
+ OPNAME ## rv30_tpel8_hhvv_lowpass(dst+8, src+8, dstStride, srcStride);\
+}\
+\
#define RV30_MC(OPNAME, SIZE) \
static void OPNAME ## rv30_tpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
@@ -177,30 +225,24 @@ static void OPNAME ## rv30_tpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, i
}\
\
static void OPNAME ## rv30_tpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
- uint8_t half[SIZE*SIZE];\
- put_rv30_tpel ## SIZE ## _h_lowpass(half, src, SIZE, stride, 12, 6);\
- OPNAME ## rv30_tpel ## SIZE ## _v_lowpass(dst, src, stride, stride, 12, 6);\
+ OPNAME ## rv30_tpel ## SIZE ## _hv_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## rv30_tpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
- uint8_t half[SIZE*SIZE];\
- put_rv30_tpel ## SIZE ## _h_lowpass(half, src, SIZE, stride, 12, 6);\
- OPNAME ## rv30_tpel ## SIZE ## _v_lowpass(dst, src, stride, stride, 6, 12);\
+ OPNAME ## rv30_tpel ## SIZE ## _hvv_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## rv30_tpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
- uint8_t half[SIZE*SIZE];\
- put_rv30_tpel ## SIZE ## _h_lowpass(half, src, SIZE, stride, 6, 12);\
- OPNAME ## rv30_tpel ## SIZE ## _v_lowpass(dst, src, stride, stride, 12, 6);\
+ OPNAME ## rv30_tpel ## SIZE ## _hhv_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## rv30_tpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## rv30_tpel ## SIZE ## _hv_lowpass(dst, src, stride, stride);\
+ OPNAME ## rv30_tpel ## SIZE ## _hhvv_lowpass(dst, src, stride, stride);\
}\
\
-#define op_avg(a, b) a = (((a)+cm[((b) + 8)>>4]+1)>>1)
-#define op_put(a, b) a = cm[((b) + 8)>>4]
+#define op_avg(a, b) a = (((a)+cm[b]+1)>>1)
+#define op_put(a, b) a = cm[b]
RV30_LOWPASS(put_ , op_put)
RV30_LOWPASS(avg_ , op_avg)
diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c
index 937c23b..450327e 100644
--- a/libavcodec/rv34.c
+++ b/libavcodec/rv34.c
@@ -367,7 +367,7 @@ int ff_rv34_get_start_offset(GetBitContext *gb, int mb_size)
{
int i;
for(i = 0; i < 5; i++)
- if(rv34_mb_max_sizes[i] > mb_size)
+ if(rv34_mb_max_sizes[i] >= mb_size - 1)
break;
return rv34_mb_bits_sizes[i];
}
@@ -564,7 +564,7 @@ static void rv34_pred_mv_rv3(RV34DecContext *r, int block_type, int dir)
MpegEncContext *s = &r->s;
int mv_pos = s->mb_x * 2 + s->mb_y * 2 * s->b8_stride;
int A[2] = {0}, B[2], C[2];
- int i, j;
+ int i, j, k;
int mx, my;
int avail_index = avail_indexes[0];
@@ -597,12 +597,12 @@ static void rv34_pred_mv_rv3(RV34DecContext *r, int block_type, int dir)
my += r->dmv[0][1];
for(j = 0; j < 2; j++){
for(i = 0; i < 2; i++){
- s->current_picture_ptr->motion_val[0][mv_pos + i + j*s->b8_stride][0] = mx;
- s->current_picture_ptr->motion_val[0][mv_pos + i + j*s->b8_stride][1] = my;
+ for(k = 0; k < 2; k++){
+ s->current_picture_ptr->motion_val[k][mv_pos + i + j*s->b8_stride][0] = mx;
+ s->current_picture_ptr->motion_val[k][mv_pos + i + j*s->b8_stride][1] = my;
+ }
}
}
- if(block_type == RV34_MB_B_BACKWARD || block_type == RV34_MB_B_FORWARD)
- fill_rectangle(s->current_picture_ptr->motion_val[!dir][mv_pos], 2, 2, s->b8_stride, 0, 4);
}
static const int chroma_coeffs[3] = { 0, 3, 5 };
@@ -644,14 +644,20 @@ static inline void rv34_mc(RV34DecContext *r, const int block_type,
uvmx = chroma_coeffs[(chroma_mx + (3 << 24)) % 3];
uvmy = chroma_coeffs[(chroma_my + (3 << 24)) % 3];
}else{
+ int cx, cy;
mx = s->current_picture_ptr->motion_val[dir][mv_pos][0] >> 2;
my = s->current_picture_ptr->motion_val[dir][mv_pos][1] >> 2;
lx = s->current_picture_ptr->motion_val[dir][mv_pos][0] & 3;
ly = s->current_picture_ptr->motion_val[dir][mv_pos][1] & 3;
- umx = mx >> 1;
- umy = my >> 1;
- uvmx = mx & 6;
- uvmy = my & 6;
+ cx = s->current_picture_ptr->motion_val[dir][mv_pos][0] / 2;
+ cy = s->current_picture_ptr->motion_val[dir][mv_pos][1] / 2;
+ umx = cx >> 2;
+ umy = cy >> 2;
+ uvmx = (cx & 3) << 1;
+ uvmy = (cy & 3) << 1;
+ //due to some flaw RV40 uses the same MC compensation routine for H2V2 and H3V3
+ if(uvmx == 6 && uvmy == 6)
+ uvmx = uvmy = 4;
}
dxy = ly*4 + lx;
srcY = dir ? s->next_picture_ptr->data[0] : s->last_picture_ptr->data[0];
@@ -664,12 +670,12 @@ static inline void rv34_mc(RV34DecContext *r, const int block_type,
srcY += src_y * s->linesize + src_x;
srcU += uvsrc_y * s->uvlinesize + uvsrc_x;
srcV += uvsrc_y * s->uvlinesize + uvsrc_x;
- if( (unsigned)(src_x - !!lx*2) > s->h_edge_pos - !!lx*2 - (width <<3) - 3
- || (unsigned)(src_y - !!ly*2) > s->v_edge_pos - !!ly*2 - (height<<3) - 3){
- uint8_t *uvbuf= s->edge_emu_buffer + 20 * s->linesize;
+ if( (unsigned)(src_x - !!lx*2) > s->h_edge_pos - !!lx*2 - (width <<3) - 4
+ || (unsigned)(src_y - !!ly*2) > s->v_edge_pos - !!ly*2 - (height<<3) - 4){
+ uint8_t *uvbuf= s->edge_emu_buffer + 22 * s->linesize;
srcY -= 2 + 2*s->linesize;
- ff_emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize, (width<<3)+4, (height<<3)+4,
+ ff_emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize, (width<<3)+6, (height<<3)+6,
src_x - 2, src_y - 2, s->h_edge_pos, s->v_edge_pos);
srcY = s->edge_emu_buffer + 2 + 2*s->linesize;
ff_emulated_edge_mc(uvbuf , srcU, s->uvlinesize, (width<<2)+1, (height<<2)+1,
@@ -704,20 +710,23 @@ static void rv34_mc_1mv(RV34DecContext *r, const int block_type,
{
rv34_mc(r, block_type, xoff, yoff, mv_off, width, height, dir, r->rv30,
r->rv30 ? r->s.dsp.put_rv30_tpel_pixels_tab
- : r->s.dsp.put_h264_qpel_pixels_tab,
- r->s.dsp.put_h264_chroma_pixels_tab);
+ : r->s.dsp.put_rv40_qpel_pixels_tab,
+ r->rv30 ? r->s.dsp.put_h264_chroma_pixels_tab
+ : r->s.dsp.put_rv40_chroma_pixels_tab);
}
static void rv34_mc_2mv(RV34DecContext *r, const int block_type)
{
rv34_mc(r, block_type, 0, 0, 0, 2, 2, 0, r->rv30,
r->rv30 ? r->s.dsp.put_rv30_tpel_pixels_tab
- : r->s.dsp.put_h264_qpel_pixels_tab,
- r->s.dsp.put_h264_chroma_pixels_tab);
+ : r->s.dsp.put_rv40_qpel_pixels_tab,
+ r->rv30 ? r->s.dsp.put_h264_chroma_pixels_tab
+ : r->s.dsp.put_rv40_chroma_pixels_tab);
rv34_mc(r, block_type, 0, 0, 0, 2, 2, 1, r->rv30,
r->rv30 ? r->s.dsp.avg_rv30_tpel_pixels_tab
- : r->s.dsp.avg_h264_qpel_pixels_tab,
- r->s.dsp.avg_h264_chroma_pixels_tab);
+ : r->s.dsp.avg_rv40_qpel_pixels_tab,
+ r->rv30 ? r->s.dsp.avg_h264_chroma_pixels_tab
+ : r->s.dsp.avg_rv40_chroma_pixels_tab);
}
static void rv34_mc_2mv_skip(RV34DecContext *r)
@@ -727,12 +736,14 @@ static void rv34_mc_2mv_skip(RV34DecContext *r)
for(i = 0; i < 2; i++){
rv34_mc(r, RV34_MB_P_8x8, i*8, j*8, i+j*r->s.b8_stride, 1, 1, 0, r->rv30,
r->rv30 ? r->s.dsp.put_rv30_tpel_pixels_tab
- : r->s.dsp.put_h264_qpel_pixels_tab,
- r->s.dsp.put_h264_chroma_pixels_tab);
+ : r->s.dsp.put_rv40_qpel_pixels_tab,
+ r->rv30 ? r->s.dsp.put_h264_chroma_pixels_tab
+ : r->s.dsp.put_rv40_chroma_pixels_tab);
rv34_mc(r, RV34_MB_P_8x8, i*8, j*8, i+j*r->s.b8_stride, 1, 1, 1, r->rv30,
r->rv30 ? r->s.dsp.avg_rv30_tpel_pixels_tab
- : r->s.dsp.avg_h264_qpel_pixels_tab,
- r->s.dsp.avg_h264_chroma_pixels_tab);
+ : r->s.dsp.avg_rv40_qpel_pixels_tab,
+ r->rv30 ? r->s.dsp.avg_h264_chroma_pixels_tab
+ : r->s.dsp.avg_rv40_chroma_pixels_tab);
}
}
@@ -953,11 +964,11 @@ static void rv34_output_macroblock(RV34DecContext *r, int8_t *intra_types, int c
itype = ittrans16[intra_types[0]];
itype = adjust_pred16(itype, r->avail_cache[5-4], r->avail_cache[5-1]);
r->h.pred16x16[itype](Y, s->linesize);
- dsp->add_pixels_clamped(s->block[0], Y, s->current_picture.linesize[0]);
- dsp->add_pixels_clamped(s->block[1], Y + 8, s->current_picture.linesize[0]);
- Y += s->current_picture.linesize[0] * 8;
- dsp->add_pixels_clamped(s->block[2], Y, s->current_picture.linesize[0]);
- dsp->add_pixels_clamped(s->block[3], Y + 8, s->current_picture.linesize[0]);
+ dsp->add_pixels_clamped(s->block[0], Y, s->linesize);
+ dsp->add_pixels_clamped(s->block[1], Y + 8, s->linesize);
+ Y += s->linesize * 8;
+ dsp->add_pixels_clamped(s->block[2], Y, s->linesize);
+ dsp->add_pixels_clamped(s->block[3], Y + 8, s->linesize);
itype = ittrans16[intra_types[0]];
if(itype == PLANE_PRED8x8) itype = DC_PRED8x8;
@@ -1046,7 +1057,7 @@ static int rv34_decode_mb_header(RV34DecContext *r, int8_t *intra_types)
* mask for retrieving all bits in coded block pattern
* corresponding to one 8x8 block
*/
-#define LUMA_CBP_BLOCK_MASK 0x303
+#define LUMA_CBP_BLOCK_MASK 0x33
#define U_CBP_MASK 0x0F0000
#define V_CBP_MASK 0xF00000
@@ -1059,7 +1070,7 @@ static void rv34_apply_differences(RV34DecContext *r, int cbp)
int i;
for(i = 0; i < 4; i++)
- if(cbp & (LUMA_CBP_BLOCK_MASK << shifts[i]))
+ if((cbp & (LUMA_CBP_BLOCK_MASK << shifts[i])) || r->block_type == RV34_MB_P_MIX16x16)
s->dsp.add_pixels_clamped(s->block[i], s->dest[0] + (i & 1)*8 + (i&2)*4*s->linesize, s->linesize);
if(cbp & U_CBP_MASK)
s->dsp.add_pixels_clamped(s->block[4], s->dest[1], s->uvlinesize);
@@ -1089,7 +1100,7 @@ static int rv34_set_deblock_coef(RV34DecContext *r)
for(i = 0; i < 2; i++){
if(is_mv_diff_gt_3(motion_val + i, 1))
vmvmask |= 0x11 << (j + i*2);
- if(is_mv_diff_gt_3(motion_val + i, s->b8_stride))
+ if((j || s->mb_y) && is_mv_diff_gt_3(motion_val + i, s->b8_stride))
hmvmask |= 0x03 << (j + i*2);
}
motion_val += s->b8_stride;
@@ -1098,7 +1109,15 @@ static int rv34_set_deblock_coef(RV34DecContext *r)
hmvmask &= ~0x000F;
if(!s->mb_x)
vmvmask &= ~0x1111;
- return hmvmask | vmvmask; //XXX: should be stored separately for RV3
+ if(r->rv30){ //RV30 marks both subblocks on the edge for filtering
+ vmvmask |= (vmvmask & 0x4444) >> 1;
+ hmvmask |= (hmvmask & 0x0F00) >> 4;
+ if(s->mb_x)
+ r->deblock_coefs[s->mb_x - 1 + s->mb_y*s->mb_stride] |= (vmvmask & 0x1111) << 3;
+ if(!s->first_slice_line)
+ r->deblock_coefs[s->mb_x + (s->mb_y - 1)*s->mb_stride] |= (hmvmask & 0xF) << 12;
+ }
+ return hmvmask | vmvmask;
}
static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types)
@@ -1129,13 +1148,13 @@ static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types)
s->qscale = r->si.quant;
cbp = cbp2 = rv34_decode_mb_header(r, intra_types);
- r->cbp_luma [s->mb_x + s->mb_y * s->mb_stride] = cbp;
- r->cbp_chroma[s->mb_x + s->mb_y * s->mb_stride] = cbp >> 16;
+ r->cbp_luma [mb_pos] = cbp;
+ r->cbp_chroma[mb_pos] = cbp >> 16;
if(s->pict_type == FF_I_TYPE)
- r->deblock_coefs[mb_pos] = 0;
+ r->deblock_coefs[mb_pos] = 0xFFFF;
else
- r->deblock_coefs[mb_pos] = rv34_set_deblock_coef(r);
- s->current_picture.qscale_table[s->mb_x + s->mb_y * s->mb_stride] = s->qscale;
+ r->deblock_coefs[mb_pos] = rv34_set_deblock_coef(r) | r->cbp_luma[mb_pos];
+ s->current_picture_ptr->qscale_table[mb_pos] = s->qscale;
if(cbp == -1)
return -1;
@@ -1169,7 +1188,7 @@ static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types)
rv34_dequant4x4(s->block[blknum] + blkoff, rv34_qscale_tab[rv34_chroma_quant[1][s->qscale]],rv34_qscale_tab[rv34_chroma_quant[0][s->qscale]]);
rv34_inv_transform(s->block[blknum] + blkoff);
}
- if(IS_INTRA(s->current_picture_ptr->mb_type[s->mb_x + s->mb_y*s->mb_stride]))
+ if(IS_INTRA(s->current_picture_ptr->mb_type[mb_pos]))
rv34_output_macroblock(r, intra_types, cbp2, r->is16);
else
rv34_apply_differences(r, cbp2);
@@ -1201,7 +1220,7 @@ static inline int slice_compare(SliceInfo *si1, SliceInfo *si2)
si1->pts != si2->pts;
}
-static int rv34_decode_slice(RV34DecContext *r, int end, uint8_t* buf, int buf_size)
+static int rv34_decode_slice(RV34DecContext *r, int end, const uint8_t* buf, int buf_size)
{
MpegEncContext *s = &r->s;
GetBitContext *gb = &s->gb;
@@ -1234,7 +1253,6 @@ static int rv34_decode_slice(RV34DecContext *r, int end, uint8_t* buf, int buf_s
if(MPV_frame_start(s, s->avctx) < 0)
return -1;
ff_er_frame_start(s);
- s->current_picture_ptr = &s->current_picture;
r->cur_pts = r->si.pts;
if(s->pict_type != FF_B_TYPE){
r->last_pts = r->next_pts;
@@ -1276,6 +1294,9 @@ static int rv34_decode_slice(RV34DecContext *r, int end, uint8_t* buf, int buf_s
memmove(r->intra_types_hist, r->intra_types, s->b4_stride * 4 * sizeof(*r->intra_types_hist));
memset(r->intra_types, -1, s->b4_stride * 4 * sizeof(*r->intra_types_hist));
+
+ if(r->loop_filter && s->mb_y >= 2)
+ r->loop_filter(r, s->mb_y - 2);
}
if(s->mb_x == s->resync_mb_x)
s->first_slice_line=0;
@@ -1331,7 +1352,7 @@ av_cold int ff_rv34_decode_init(AVCodecContext *avctx)
return 0;
}
-static int get_slice_offset(AVCodecContext *avctx, uint8_t *buf, int n)
+static int get_slice_offset(AVCodecContext *avctx, const uint8_t *buf, int n)
{
if(avctx->slice_count) return avctx->slice_offset[n];
else return AV_RL32(buf + n*8 - 4) == 1 ? AV_RL32(buf + n*8) : AV_RB32(buf + n*8);
@@ -1339,7 +1360,7 @@ static int get_slice_offset(AVCodecContext *avctx, uint8_t *buf, int n)
int ff_rv34_decode_frame(AVCodecContext *avctx,
void *data, int *data_size,
- uint8_t *buf, int buf_size)
+ const uint8_t *buf, int buf_size)
{
RV34DecContext *r = avctx->priv_data;
MpegEncContext *s = &r->s;
@@ -1347,7 +1368,7 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
SliceInfo si;
int i;
int slice_count;
- uint8_t *slices_hdr = NULL;
+ const uint8_t *slices_hdr = NULL;
int last = 0;
/* no supplementary picture */
@@ -1377,6 +1398,11 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
else
size= get_slice_offset(avctx, slices_hdr, i+1) - offset;
+ if(offset > buf_size){
+ av_log(avctx, AV_LOG_ERROR, "Slice offset is greater than frame size\n");
+ break;
+ }
+
r->si.end = s->mb_width * s->mb_height;
if(i+1 < slice_count){
init_get_bits(&s->gb, buf+get_slice_offset(avctx, slices_hdr, i+1), (buf_size-get_slice_offset(avctx, slices_hdr, i+1))*8);
@@ -1388,6 +1414,8 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
}else
r->si.end = si.start;
}
+ if(!i && si.type == FF_B_TYPE && (!s->last_picture_ptr || !s->last_picture_ptr->data[0]))
+ return -1;
last = rv34_decode_slice(r, r->si.end, buf + offset, size);
s->mb_num_left = r->s.mb_x + r->s.mb_y*r->s.mb_width - r->si.start;
if(last)
@@ -1396,7 +1424,7 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
if(last){
if(r->loop_filter)
- r->loop_filter(r);
+ r->loop_filter(r, s->mb_height - 1);
ff_er_frame_end(s);
MPV_frame_end(s);
if (s->pict_type == FF_B_TYPE || s->low_delay) {
diff --git a/libavcodec/rv34.h b/libavcodec/rv34.h
index 16e0faa..65dbb8a 100644
--- a/libavcodec/rv34.h
+++ b/libavcodec/rv34.h
@@ -115,7 +115,7 @@ typedef struct RV34DecContext{
int (*parse_slice_header)(struct RV34DecContext *r, GetBitContext *gb, SliceInfo *si);
int (*decode_mb_info)(struct RV34DecContext *r);
int (*decode_intra_types)(struct RV34DecContext *r, GetBitContext *gb, int8_t *dst);
- void (*loop_filter)(struct RV34DecContext *r);
+ void (*loop_filter)(struct RV34DecContext *r, int row);
}RV34DecContext;
/**
@@ -123,7 +123,7 @@ typedef struct RV34DecContext{
*/
int ff_rv34_get_start_offset(GetBitContext *gb, int blocks);
int ff_rv34_decode_init(AVCodecContext *avctx);
-int ff_rv34_decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8_t *buf, int buf_size);
+int ff_rv34_decode_frame(AVCodecContext *avctx, void *data, int *data_size, const uint8_t *buf, int buf_size);
int ff_rv34_decode_end(AVCodecContext *avctx);
#endif /* AVCODEC_RV34_H */
diff --git a/libavcodec/rv34data.h b/libavcodec/rv34data.h
index e4862a3..95e5572 100644
--- a/libavcodec/rv34data.h
+++ b/libavcodec/rv34data.h
@@ -138,7 +138,7 @@ static const uint8_t rv34_dquant_tab[2][32]={
* maximum number of macroblocks for each of the possible slice offset sizes
* @todo This is the same as ff_mba_max, maybe use it instead.
*/
-static const uint16_t rv34_mb_max_sizes[6] = { 0x2F, 0x68, 0x18B, 0x62F, 0x18BF, 0x23FF };
+static const uint16_t rv34_mb_max_sizes[6] = { 0x2F, 0x62, 0x18B, 0x62F, 0x18BF, 0x23FF };
/**
* bits needed to code the slice offset for the given size
* @todo This is the same as ff_mba_length, maybe use it instead.
diff --git a/libavcodec/rv40.c b/libavcodec/rv40.c
index 2d52967..083de1b 100644
--- a/libavcodec/rv40.c
+++ b/libavcodec/rv40.c
@@ -247,6 +247,383 @@ static int rv40_decode_mb_info(RV34DecContext *r)
return 0;
}
+#define CLIP_SYMM(a, b) av_clip(a, -(b), b)
+/**
+ * weaker deblocking very similar to the one described in 4.4.2 of JVT-A003r1
+ */
+static inline void rv40_weak_loop_filter(uint8_t *src, const int step,
+ const int filter_p1, const int filter_q1,
+ const int alpha, const int beta,
+ const int lim_p0q0,
+ const int lim_q1, const int lim_p1,
+ const int diff_p1p0, const int diff_q1q0,
+ const int diff_p1p2, const int diff_q1q2)
+{
+ uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+ int t, u, diff;
+
+ t = src[0*step] - src[-1*step];
+ if(!t)
+ return;
+ u = (alpha * FFABS(t)) >> 7;
+ if(u > 3 - (filter_p1 && filter_q1))
+ return;
+
+ t <<= 2;
+ if(filter_p1 && filter_q1)
+ t += src[-2*step] - src[1*step];
+ diff = CLIP_SYMM((t + 4) >> 3, lim_p0q0);
+ src[-1*step] = cm[src[-1*step] + diff];
+ src[ 0*step] = cm[src[ 0*step] - diff];
+ if(FFABS(diff_p1p2) <= beta && filter_p1){
+ t = (diff_p1p0 + diff_p1p2 - diff) >> 1;
+ src[-2*step] = cm[src[-2*step] - CLIP_SYMM(t, lim_p1)];
+ }
+ if(FFABS(diff_q1q2) <= beta && filter_q1){
+ t = (diff_q1q0 + diff_q1q2 + diff) >> 1;
+ src[ 1*step] = cm[src[ 1*step] - CLIP_SYMM(t, lim_q1)];
+ }
+}
+
+static inline void rv40_adaptive_loop_filter(uint8_t *src, const int step,
+ const int stride, const int dmode,
+ const int lim_q1, const int lim_p1,
+ const int alpha,
+ const int beta, const int beta2,
+ const int chroma, const int edge)
+{
+ int diff_p1p0[4], diff_q1q0[4], diff_p1p2[4], diff_q1q2[4];
+ int sum_p1p0 = 0, sum_q1q0 = 0, sum_p1p2 = 0, sum_q1q2 = 0;
+ uint8_t *ptr;
+ int flag_strong0 = 1, flag_strong1 = 1;
+ int filter_p1, filter_q1;
+ int i;
+ int lims;
+
+ for(i = 0, ptr = src; i < 4; i++, ptr += stride){
+ diff_p1p0[i] = ptr[-2*step] - ptr[-1*step];
+ diff_q1q0[i] = ptr[ 1*step] - ptr[ 0*step];
+ sum_p1p0 += diff_p1p0[i];
+ sum_q1q0 += diff_q1q0[i];
+ }
+ filter_p1 = FFABS(sum_p1p0) < (beta<<2);
+ filter_q1 = FFABS(sum_q1q0) < (beta<<2);
+ if(!filter_p1 && !filter_q1)
+ return;
+
+ for(i = 0, ptr = src; i < 4; i++, ptr += stride){
+ diff_p1p2[i] = ptr[-2*step] - ptr[-3*step];
+ diff_q1q2[i] = ptr[ 1*step] - ptr[ 2*step];
+ sum_p1p2 += diff_p1p2[i];
+ sum_q1q2 += diff_q1q2[i];
+ }
+
+ if(edge){
+ flag_strong0 = filter_p1 && (FFABS(sum_p1p2) < beta2);
+ flag_strong1 = filter_q1 && (FFABS(sum_q1q2) < beta2);
+ }else{
+ flag_strong0 = flag_strong1 = 0;
+ }
+
+ lims = filter_p1 + filter_q1 + ((lim_q1 + lim_p1) >> 1) + 1;
+ if(flag_strong0 && flag_strong1){ /* strong filtering */
+ for(i = 0; i < 4; i++, src += stride){
+ int sflag, p0, q0, p1, q1;
+ int t = src[0*step] - src[-1*step];
+
+ if(!t) continue;
+ sflag = (alpha * FFABS(t)) >> 7;
+ if(sflag > 1) continue;
+
+ p0 = (25*src[-3*step] + 26*src[-2*step]
+ + 26*src[-1*step]
+ + 26*src[ 0*step] + 25*src[ 1*step] + rv40_dither_l[dmode + i]) >> 7;
+ q0 = (25*src[-2*step] + 26*src[-1*step]
+ + 26*src[ 0*step]
+ + 26*src[ 1*step] + 25*src[ 2*step] + rv40_dither_r[dmode + i]) >> 7;
+ if(sflag){
+ p0 = av_clip(p0, src[-1*step] - lims, src[-1*step] + lims);
+ q0 = av_clip(q0, src[ 0*step] - lims, src[ 0*step] + lims);
+ }
+ p1 = (25*src[-4*step] + 26*src[-3*step]
+ + 26*src[-2*step]
+ + 26*p0 + 25*src[ 0*step] + rv40_dither_l[dmode + i]) >> 7;
+ q1 = (25*src[-1*step] + 26*q0
+ + 26*src[ 1*step]
+ + 26*src[ 2*step] + 25*src[ 3*step] + rv40_dither_r[dmode + i]) >> 7;
+ if(sflag){
+ p1 = av_clip(p1, src[-2*step] - lims, src[-2*step] + lims);
+ q1 = av_clip(q1, src[ 1*step] - lims, src[ 1*step] + lims);
+ }
+ src[-2*step] = p1;
+ src[-1*step] = p0;
+ src[ 0*step] = q0;
+ src[ 1*step] = q1;
+ if(!chroma){
+ src[-3*step] = (25*src[-1*step] + 26*src[-2*step] + 51*src[-3*step] + 26*src[-4*step] + 64) >> 7;
+ src[ 2*step] = (25*src[ 0*step] + 26*src[ 1*step] + 51*src[ 2*step] + 26*src[ 3*step] + 64) >> 7;
+ }
+ }
+ }else if(filter_p1 && filter_q1){
+ for(i = 0; i < 4; i++, src += stride)
+ rv40_weak_loop_filter(src, step, 1, 1, alpha, beta, lims, lim_q1, lim_p1,
+ diff_p1p0[i], diff_q1q0[i], diff_p1p2[i], diff_q1q2[i]);
+ }else{
+ for(i = 0; i < 4; i++, src += stride)
+ rv40_weak_loop_filter(src, step, filter_p1, filter_q1,
+ alpha, beta, lims>>1, lim_q1>>1, lim_p1>>1,
+ diff_p1p0[i], diff_q1q0[i], diff_p1p2[i], diff_q1q2[i]);
+ }
+}
+
+static void rv40_v_loop_filter(uint8_t *src, int stride, int dmode,
+ int lim_q1, int lim_p1,
+ int alpha, int beta, int beta2, int chroma, int edge){
+ rv40_adaptive_loop_filter(src, 1, stride, dmode, lim_q1, lim_p1,
+ alpha, beta, beta2, chroma, edge);
+}
+static void rv40_h_loop_filter(uint8_t *src, int stride, int dmode,
+ int lim_q1, int lim_p1,
+ int alpha, int beta, int beta2, int chroma, int edge){
+ rv40_adaptive_loop_filter(src, stride, 1, dmode, lim_q1, lim_p1,
+ alpha, beta, beta2, chroma, edge);
+}
+
+enum RV40BlockPos{
+ POS_CUR,
+ POS_TOP,
+ POS_LEFT,
+ POS_BOTTOM,
+};
+
+#define MASK_CUR 0x0001
+#define MASK_RIGHT 0x0008
+#define MASK_BOTTOM 0x0010
+#define MASK_TOP 0x1000
+#define MASK_Y_TOP_ROW 0x000F
+#define MASK_Y_LAST_ROW 0xF000
+#define MASK_Y_LEFT_COL 0x1111
+#define MASK_Y_RIGHT_COL 0x8888
+#define MASK_C_TOP_ROW 0x0003
+#define MASK_C_LAST_ROW 0x000C
+#define MASK_C_LEFT_COL 0x0005
+#define MASK_C_RIGHT_COL 0x000A
+
+static const int neighbour_offs_x[4] = { 0, 0, -1, 0 };
+static const int neighbour_offs_y[4] = { 0, -1, 0, 1 };
+
+/**
+ * RV40 loop filtering function
+ */
+static void rv40_loop_filter(RV34DecContext *r, int row)
+{
+ MpegEncContext *s = &r->s;
+ int mb_pos, mb_x;
+ int i, j, k;
+ uint8_t *Y, *C;
+ int alpha, beta, betaY, betaC;
+ int q;
+ int mbtype[4]; ///< current macroblock and its neighbours types
+ /**
+ * flags indicating that macroblock can be filtered with strong filter
+ * it is set only for intra coded MB and MB with DCs coded separately
+ */
+ int mb_strong[4];
+ int clip[4]; ///< MB filter clipping value calculated from filtering strength
+ /**
+ * coded block patterns for luma part of current macroblock and its neighbours
+ * Format:
+ * LSB corresponds to the top left block,
+ * each nibble represents one row of subblocks.
+ */
+ int cbp[4];
+ /**
+ * coded block patterns for chroma part of current macroblock and its neighbours
+ * Format is the same as for luma with two subblocks in a row.
+ */
+ int uvcbp[4][2];
+ /**
+ * This mask represents the pattern of luma subblocks that should be filtered
+ * in addition to the coded ones because they lie at the edge of
+ * 8x8 block with different enough motion vectors
+ */
+ int mvmasks[4];
+
+ mb_pos = row * s->mb_stride;
+ for(mb_x = 0; mb_x < s->mb_width; mb_x++, mb_pos++){
+ int mbtype = s->current_picture_ptr->mb_type[mb_pos];
+ if(IS_INTRA(mbtype) || IS_SEPARATE_DC(mbtype))
+ r->cbp_luma [mb_pos] = r->deblock_coefs[mb_pos] = 0xFFFF;
+ if(IS_INTRA(mbtype))
+ r->cbp_chroma[mb_pos] = 0xFF;
+ }
+ mb_pos = row * s->mb_stride;
+ for(mb_x = 0; mb_x < s->mb_width; mb_x++, mb_pos++){
+ int y_h_deblock, y_v_deblock;
+ int c_v_deblock[2], c_h_deblock[2];
+ int clip_left;
+ int avail[4];
+ int y_to_deblock, c_to_deblock[2];
+
+ q = s->current_picture_ptr->qscale_table[mb_pos];
+ alpha = rv40_alpha_tab[q];
+ beta = rv40_beta_tab [q];
+ betaY = betaC = beta * 3;
+ if(s->width * s->height <= 176*144)
+ betaY += beta;
+
+ avail[0] = 1;
+ avail[1] = row;
+ avail[2] = mb_x;
+ avail[3] = row < s->mb_height - 1;
+ for(i = 0; i < 4; i++){
+ if(avail[i]){
+ int pos = mb_pos + neighbour_offs_x[i] + neighbour_offs_y[i]*s->mb_stride;
+ mvmasks[i] = r->deblock_coefs[pos];
+ mbtype [i] = s->current_picture_ptr->mb_type[pos];
+ cbp [i] = r->cbp_luma[pos];
+ uvcbp[i][0] = r->cbp_chroma[pos] & 0xF;
+ uvcbp[i][1] = r->cbp_chroma[pos] >> 4;
+ }else{
+ mvmasks[i] = 0;
+ mbtype [i] = mbtype[0];
+ cbp [i] = 0;
+ uvcbp[i][0] = uvcbp[i][1] = 0;
+ }
+ mb_strong[i] = IS_INTRA(mbtype[i]) || IS_SEPARATE_DC(mbtype[i]);
+ clip[i] = rv40_filter_clip_tbl[mb_strong[i] + 1][q];
+ }
+ y_to_deblock = mvmasks[POS_CUR]
+ | (mvmasks[POS_BOTTOM] << 16);
+ /* This pattern contains bits signalling that horizontal edges of
+ * the current block can be filtered.
+ * That happens when either of adjacent subblocks is coded or lies on
+ * the edge of 8x8 blocks with motion vectors differing by more than
+ * 3/4 pel in any component (any edge orientation for some reason).
+ */
+ y_h_deblock = y_to_deblock
+ | ((cbp[POS_CUR] << 4) & ~MASK_Y_TOP_ROW)
+ | ((cbp[POS_TOP] & MASK_Y_LAST_ROW) >> 12);
+ /* This pattern contains bits signalling that vertical edges of
+ * the current block can be filtered.
+ * That happens when either of adjacent subblocks is coded or lies on
+ * the edge of 8x8 blocks with motion vectors differing by more than
+ * 3/4 pel in any component (any edge orientation for some reason).
+ */
+ y_v_deblock = y_to_deblock
+ | ((cbp[POS_CUR] << 1) & ~MASK_Y_LEFT_COL)
+ | ((cbp[POS_LEFT] & MASK_Y_RIGHT_COL) >> 3);
+ if(!mb_x)
+ y_v_deblock &= ~MASK_Y_LEFT_COL;
+ if(!row)
+ y_h_deblock &= ~MASK_Y_TOP_ROW;
+ if(row == s->mb_height - 1 || (mb_strong[POS_CUR] || mb_strong[POS_BOTTOM]))
+ y_h_deblock &= ~(MASK_Y_TOP_ROW << 16);
+ /* Calculating chroma patterns is similar and easier since there is
+ * no motion vector pattern for them.
+ */
+ for(i = 0; i < 2; i++){
+ c_to_deblock[i] = (uvcbp[POS_BOTTOM][i] << 4) | uvcbp[POS_CUR][i];
+ c_v_deblock[i] = c_to_deblock[i]
+ | ((uvcbp[POS_CUR] [i] << 1) & ~MASK_C_LEFT_COL)
+ | ((uvcbp[POS_LEFT][i] & MASK_C_RIGHT_COL) >> 1);
+ c_h_deblock[i] = c_to_deblock[i]
+ | ((uvcbp[POS_TOP][i] & MASK_C_LAST_ROW) >> 2)
+ | (uvcbp[POS_CUR][i] << 2);
+ if(!mb_x)
+ c_v_deblock[i] &= ~MASK_C_LEFT_COL;
+ if(!row)
+ c_h_deblock[i] &= ~MASK_C_TOP_ROW;
+ if(row == s->mb_height - 1 || mb_strong[POS_CUR] || mb_strong[POS_BOTTOM])
+ c_h_deblock[i] &= ~(MASK_C_TOP_ROW << 4);
+ }
+
+ for(j = 0; j < 16; j += 4){
+ Y = s->current_picture_ptr->data[0] + mb_x*16 + (row*16 + j) * s->linesize;
+ for(i = 0; i < 4; i++, Y += 4){
+ int ij = i + j;
+ int clip_cur = y_to_deblock & (MASK_CUR << ij) ? clip[POS_CUR] : 0;
+ int dither = j ? ij : i*4;
+
+ // if bottom block is coded then we can filter its top edge
+ // (or bottom edge of this block, which is the same)
+ if(y_h_deblock & (MASK_BOTTOM << ij)){
+ rv40_h_loop_filter(Y+4*s->linesize, s->linesize, dither,
+ y_to_deblock & (MASK_BOTTOM << ij) ? clip[POS_CUR] : 0,
+ clip_cur,
+ alpha, beta, betaY, 0, 0);
+ }
+ // filter left block edge in ordinary mode (with low filtering strength)
+ if(y_v_deblock & (MASK_CUR << ij) && (i || !(mb_strong[POS_CUR] || mb_strong[POS_LEFT]))){
+ if(!i)
+ clip_left = mvmasks[POS_LEFT] & (MASK_RIGHT << j) ? clip[POS_LEFT] : 0;
+ else
+ clip_left = y_to_deblock & (MASK_CUR << (ij-1)) ? clip[POS_CUR] : 0;
+ rv40_v_loop_filter(Y, s->linesize, dither,
+ clip_cur,
+ clip_left,
+ alpha, beta, betaY, 0, 0);
+ }
+ // filter top edge of the current macroblock when filtering strength is high
+ if(!j && y_h_deblock & (MASK_CUR << i) && (mb_strong[POS_CUR] || mb_strong[POS_TOP])){
+ rv40_h_loop_filter(Y, s->linesize, dither,
+ clip_cur,
+ mvmasks[POS_TOP] & (MASK_TOP << i) ? clip[POS_TOP] : 0,
+ alpha, beta, betaY, 0, 1);
+ }
+ // filter left block edge in edge mode (with high filtering strength)
+ if(y_v_deblock & (MASK_CUR << ij) && !i && (mb_strong[POS_CUR] || mb_strong[POS_LEFT])){
+ clip_left = mvmasks[POS_LEFT] & (MASK_RIGHT << j) ? clip[POS_LEFT] : 0;
+ rv40_v_loop_filter(Y, s->linesize, dither,
+ clip_cur,
+ clip_left,
+ alpha, beta, betaY, 0, 1);
+ }
+ }
+ }
+ for(k = 0; k < 2; k++){
+ for(j = 0; j < 2; j++){
+ C = s->current_picture_ptr->data[k+1] + mb_x*8 + (row*8 + j*4) * s->uvlinesize;
+ for(i = 0; i < 2; i++, C += 4){
+ int ij = i + j*2;
+ int clip_cur = c_to_deblock[k] & (MASK_CUR << ij) ? clip[POS_CUR] : 0;
+ if(c_h_deblock[k] & (MASK_CUR << (ij+2))){
+ int clip_bot = c_to_deblock[k] & (MASK_CUR << (ij+2)) ? clip[POS_CUR] : 0;
+ rv40_h_loop_filter(C+4*s->uvlinesize, s->uvlinesize, i*8,
+ clip_bot,
+ clip_cur,
+ alpha, beta, betaC, 1, 0);
+ }
+ if((c_v_deblock[k] & (MASK_CUR << ij)) && (i || !(mb_strong[POS_CUR] || mb_strong[POS_LEFT]))){
+ if(!i)
+ clip_left = uvcbp[POS_LEFT][k] & (MASK_CUR << (2*j+1)) ? clip[POS_LEFT] : 0;
+ else
+ clip_left = c_to_deblock[k] & (MASK_CUR << (ij-1)) ? clip[POS_CUR] : 0;
+ rv40_v_loop_filter(C, s->uvlinesize, j*8,
+ clip_cur,
+ clip_left,
+ alpha, beta, betaC, 1, 0);
+ }
+ if(!j && c_h_deblock[k] & (MASK_CUR << ij) && (mb_strong[POS_CUR] || mb_strong[POS_TOP])){
+ int clip_top = uvcbp[POS_TOP][k] & (MASK_CUR << (ij+2)) ? clip[POS_TOP] : 0;
+ rv40_h_loop_filter(C, s->uvlinesize, i*8,
+ clip_cur,
+ clip_top,
+ alpha, beta, betaC, 1, 1);
+ }
+ if(c_v_deblock[k] & (MASK_CUR << ij) && !i && (mb_strong[POS_CUR] || mb_strong[POS_LEFT])){
+ clip_left = uvcbp[POS_LEFT][k] & (MASK_CUR << (2*j+1)) ? clip[POS_LEFT] : 0;
+ rv40_v_loop_filter(C, s->uvlinesize, j*8,
+ clip_cur,
+ clip_left,
+ alpha, beta, betaC, 1, 1);
+ }
+ }
+ }
+ }
+ }
+}
+
/**
* Initialize decoder.
*/
@@ -261,6 +638,7 @@ static av_cold int rv40_decode_init(AVCodecContext *avctx)
r->parse_slice_header = rv40_parse_slice_header;
r->decode_intra_types = rv40_decode_intra_types;
r->decode_mb_info = rv40_decode_mb_info;
+ r->loop_filter = rv40_loop_filter;
r->luma_dc_quant_i = rv40_luma_dc_quant[0];
r->luma_dc_quant_p = rv40_luma_dc_quant[1];
return 0;
diff --git a/libavcodec/rv40dsp.c b/libavcodec/rv40dsp.c
new file mode 100644
index 0000000..b48c4e8
--- /dev/null
+++ b/libavcodec/rv40dsp.c
@@ -0,0 +1,353 @@
+/*
+ * RV40 decoder motion compensation functions
+ * Copyright (c) 2008 Konstantin Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file rv40dsp.c
+ * RV40 decoder motion compensation functions
+ */
+
+#include "avcodec.h"
+#include "dsputil.h"
+
+#define RV40_LOWPASS(OPNAME, OP) \
+static av_unused void OPNAME ## rv40_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,\
+ const int h, const int C1, const int C2, const int SHIFT){\
+ uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
+ int i;\
+ for(i=0; i<h; i++)\
+ {\
+ OP(dst[0], (src[-2] + src[ 3] - 5*(src[-1]+src[2]) + src[0]*C1 + src[1]*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ OP(dst[1], (src[-1] + src[ 4] - 5*(src[ 0]+src[3]) + src[1]*C1 + src[2]*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ OP(dst[2], (src[ 0] + src[ 5] - 5*(src[ 1]+src[4]) + src[2]*C1 + src[3]*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ OP(dst[3], (src[ 1] + src[ 6] - 5*(src[ 2]+src[5]) + src[3]*C1 + src[4]*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ OP(dst[4], (src[ 2] + src[ 7] - 5*(src[ 3]+src[6]) + src[4]*C1 + src[5]*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ OP(dst[5], (src[ 3] + src[ 8] - 5*(src[ 4]+src[7]) + src[5]*C1 + src[6]*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ OP(dst[6], (src[ 4] + src[ 9] - 5*(src[ 5]+src[8]) + src[6]*C1 + src[7]*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ OP(dst[7], (src[ 5] + src[10] - 5*(src[ 6]+src[9]) + src[7]*C1 + src[8]*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ dst+=dstStride;\
+ src+=srcStride;\
+ }\
+}\
+\
+static void OPNAME ## rv40_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,\
+ const int w, const int C1, const int C2, const int SHIFT){\
+ uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
+ int i;\
+ for(i=0; i<w; i++)\
+ {\
+ const int srcB = src[-2*srcStride];\
+ const int srcA = src[-1*srcStride];\
+ const int src0 = src[0 *srcStride];\
+ const int src1 = src[1 *srcStride];\
+ const int src2 = src[2 *srcStride];\
+ const int src3 = src[3 *srcStride];\
+ const int src4 = src[4 *srcStride];\
+ const int src5 = src[5 *srcStride];\
+ const int src6 = src[6 *srcStride];\
+ const int src7 = src[7 *srcStride];\
+ const int src8 = src[8 *srcStride];\
+ const int src9 = src[9 *srcStride];\
+ const int src10= src[10*srcStride];\
+ OP(dst[0*dstStride], (srcB + src3 - 5*(srcA+src2) + src0*C1 + src1*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ OP(dst[1*dstStride], (srcA + src4 - 5*(src0+src3) + src1*C1 + src2*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ OP(dst[2*dstStride], (src0 + src5 - 5*(src1+src4) + src2*C1 + src3*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ OP(dst[3*dstStride], (src1 + src6 - 5*(src2+src5) + src3*C1 + src4*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ OP(dst[4*dstStride], (src2 + src7 - 5*(src3+src6) + src4*C1 + src5*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ OP(dst[5*dstStride], (src3 + src8 - 5*(src4+src7) + src5*C1 + src6*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ OP(dst[6*dstStride], (src4 + src9 - 5*(src5+src8) + src6*C1 + src7*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ OP(dst[7*dstStride], (src5 + src10 - 5*(src6+src9) + src7*C1 + src8*C2 + (1<<(SHIFT-1))) >> SHIFT);\
+ dst++;\
+ src++;\
+ }\
+}\
+\
+static void OPNAME ## rv40_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,\
+ const int w, const int C1, const int C2, const int SHIFT){\
+ OPNAME ## rv40_qpel8_v_lowpass(dst , src , dstStride, srcStride, 8, C1, C2, SHIFT);\
+ OPNAME ## rv40_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride, 8, C1, C2, SHIFT);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ OPNAME ## rv40_qpel8_v_lowpass(dst , src , dstStride, srcStride, w-8, C1, C2, SHIFT);\
+ OPNAME ## rv40_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride, w-8, C1, C2, SHIFT);\
+}\
+\
+static void OPNAME ## rv40_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,\
+ const int h, const int C1, const int C2, const int SHIFT){\
+ OPNAME ## rv40_qpel8_h_lowpass(dst , src , dstStride, srcStride, 8, C1, C2, SHIFT);\
+ OPNAME ## rv40_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride, 8, C1, C2, SHIFT);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ OPNAME ## rv40_qpel8_h_lowpass(dst , src , dstStride, srcStride, h-8, C1, C2, SHIFT);\
+ OPNAME ## rv40_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride, h-8, C1, C2, SHIFT);\
+}\
+\
+
+#define RV40_MC(OPNAME, SIZE) \
+static void OPNAME ## rv40_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## rv40_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride, SIZE, 52, 20, 6);\
+}\
+\
+static void OPNAME ## rv40_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## rv40_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride, SIZE, 20, 20, 5);\
+}\
+\
+static void OPNAME ## rv40_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## rv40_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride, SIZE, 20, 52, 6);\
+}\
+\
+static void OPNAME ## rv40_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## rv40_qpel ## SIZE ## _v_lowpass(dst, src, stride, stride, SIZE, 52, 20, 6);\
+}\
+\
+static void OPNAME ## rv40_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ put_rv40_qpel ## SIZE ## _h_lowpass(full, src - 2*stride, SIZE, stride, SIZE+5, 52, 20, 6);\
+ OPNAME ## rv40_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE, SIZE, 52, 20, 6);\
+}\
+\
+static void OPNAME ## rv40_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ put_rv40_qpel ## SIZE ## _h_lowpass(full, src - 2*stride, SIZE, stride, SIZE+5, 20, 20, 5);\
+ OPNAME ## rv40_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE, SIZE, 52, 20, 6);\
+}\
+\
+static void OPNAME ## rv40_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ put_rv40_qpel ## SIZE ## _h_lowpass(full, src - 2*stride, SIZE, stride, SIZE+5, 20, 52, 6);\
+ OPNAME ## rv40_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE, SIZE, 52, 20, 6);\
+}\
+\
+static void OPNAME ## rv40_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## rv40_qpel ## SIZE ## _v_lowpass(dst, src, stride, stride, SIZE, 20, 20, 5);\
+}\
+\
+static void OPNAME ## rv40_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ put_rv40_qpel ## SIZE ## _h_lowpass(full, src - 2*stride, SIZE, stride, SIZE+5, 52, 20, 6);\
+ OPNAME ## rv40_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE, SIZE, 20, 20, 5);\
+}\
+\
+static void OPNAME ## rv40_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ put_rv40_qpel ## SIZE ## _h_lowpass(full, src - 2*stride, SIZE, stride, SIZE+5, 20, 20, 5);\
+ OPNAME ## rv40_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE, SIZE, 20, 20, 5);\
+}\
+\
+static void OPNAME ## rv40_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ put_rv40_qpel ## SIZE ## _h_lowpass(full, src - 2*stride, SIZE, stride, SIZE+5, 20, 52, 6);\
+ OPNAME ## rv40_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE, SIZE, 20, 20, 5);\
+}\
+\
+static void OPNAME ## rv40_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## rv40_qpel ## SIZE ## _v_lowpass(dst, src, stride, stride, SIZE, 20, 52, 6);\
+}\
+\
+static void OPNAME ## rv40_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ put_rv40_qpel ## SIZE ## _h_lowpass(full, src - 2*stride, SIZE, stride, SIZE+5, 52, 20, 6);\
+ OPNAME ## rv40_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE, SIZE, 20, 52, 6);\
+}\
+\
+static void OPNAME ## rv40_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
+ uint8_t full[SIZE*(SIZE+5)];\
+ uint8_t * const full_mid= full + SIZE*2;\
+ put_rv40_qpel ## SIZE ## _h_lowpass(full, src - 2*stride, SIZE, stride, SIZE+5, 20, 20, 5);\
+ OPNAME ## rv40_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE, SIZE, 20, 52, 6);\
+}\
+\
+
+#define op_avg(a, b) a = (((a)+cm[b]+1)>>1)
+#define op_put(a, b) a = cm[b]
+
+RV40_LOWPASS(put_ , op_put)
+RV40_LOWPASS(avg_ , op_avg)
+
+#undef op_avg
+#undef op_put
+
+RV40_MC(put_, 8)
+RV40_MC(put_, 16)
+RV40_MC(avg_, 8)
+RV40_MC(avg_, 16)
+
+static const int rv40_bias[4][4] = {
+ { 0, 16, 32, 16 },
+ { 32, 28, 32, 28 },
+ { 0, 32, 16, 32 },
+ { 32, 28, 32, 28 }
+};
+
+#define RV40_CHROMA_MC(OPNAME, OP)\
+static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
+ const int A=(8-x)*(8-y);\
+ const int B=( x)*(8-y);\
+ const int C=(8-x)*( y);\
+ const int D=( x)*( y);\
+ int i;\
+ int bias = rv40_bias[y>>1][x>>1];\
+ \
+ assert(x<8 && y<8 && x>=0 && y>=0);\
+\
+ if(D){\
+ for(i=0; i<h; i++){\
+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + bias));\
+ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + bias));\
+ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + bias));\
+ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + bias));\
+ dst+= stride;\
+ src+= stride;\
+ }\
+ }else{\
+ const int E= B+C;\
+ const int step= C ? stride : 1;\
+ for(i=0; i<h; i++){\
+ OP(dst[0], (A*src[0] + E*src[step+0] + bias));\
+ OP(dst[1], (A*src[1] + E*src[step+1] + bias));\
+ OP(dst[2], (A*src[2] + E*src[step+2] + bias));\
+ OP(dst[3], (A*src[3] + E*src[step+3] + bias));\
+ dst+= stride;\
+ src+= stride;\
+ }\
+ }\
+}\
+\
+static void OPNAME ## rv40_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
+ const int A=(8-x)*(8-y);\
+ const int B=( x)*(8-y);\
+ const int C=(8-x)*( y);\
+ const int D=( x)*( y);\
+ int i;\
+ int bias = rv40_bias[y>>1][x>>1];\
+ \
+ assert(x<8 && y<8 && x>=0 && y>=0);\
+\
+ if(D){\
+ for(i=0; i<h; i++){\
+ OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + bias));\
+ OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + bias));\
+ OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + bias));\
+ OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + bias));\
+ OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + bias));\
+ OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + bias));\
+ OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + bias));\
+ OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + bias));\
+ dst+= stride;\
+ src+= stride;\
+ }\
+ }else{\
+ const int E= B+C;\
+ const int step= C ? stride : 1;\
+ for(i=0; i<h; i++){\
+ OP(dst[0], (A*src[0] + E*src[step+0] + bias));\
+ OP(dst[1], (A*src[1] + E*src[step+1] + bias));\
+ OP(dst[2], (A*src[2] + E*src[step+2] + bias));\
+ OP(dst[3], (A*src[3] + E*src[step+3] + bias));\
+ OP(dst[4], (A*src[4] + E*src[step+4] + bias));\
+ OP(dst[5], (A*src[5] + E*src[step+5] + bias));\
+ OP(dst[6], (A*src[6] + E*src[step+6] + bias));\
+ OP(dst[7], (A*src[7] + E*src[step+7] + bias));\
+ dst+= stride;\
+ src+= stride;\
+ }\
+ }\
+}
+
+#define op_avg(a, b) a = (((a)+((b)>>6)+1)>>1)
+#define op_put(a, b) a = ((b)>>6)
+
+RV40_CHROMA_MC(put_, op_put)
+RV40_CHROMA_MC(avg_, op_avg)
+
+void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx) {
+ c->put_rv40_qpel_pixels_tab[0][ 0] = c->put_h264_qpel_pixels_tab[0][0];
+ c->put_rv40_qpel_pixels_tab[0][ 1] = put_rv40_qpel16_mc10_c;
+ c->put_rv40_qpel_pixels_tab[0][ 2] = put_rv40_qpel16_mc20_c;
+ c->put_rv40_qpel_pixels_tab[0][ 3] = put_rv40_qpel16_mc30_c;
+ c->put_rv40_qpel_pixels_tab[0][ 4] = put_rv40_qpel16_mc01_c;
+ c->put_rv40_qpel_pixels_tab[0][ 5] = put_rv40_qpel16_mc11_c;
+ c->put_rv40_qpel_pixels_tab[0][ 6] = put_rv40_qpel16_mc21_c;
+ c->put_rv40_qpel_pixels_tab[0][ 7] = put_rv40_qpel16_mc31_c;
+ c->put_rv40_qpel_pixels_tab[0][ 8] = put_rv40_qpel16_mc02_c;
+ c->put_rv40_qpel_pixels_tab[0][ 9] = put_rv40_qpel16_mc12_c;
+ c->put_rv40_qpel_pixels_tab[0][10] = put_rv40_qpel16_mc22_c;
+ c->put_rv40_qpel_pixels_tab[0][11] = put_rv40_qpel16_mc32_c;
+ c->put_rv40_qpel_pixels_tab[0][12] = put_rv40_qpel16_mc03_c;
+ c->put_rv40_qpel_pixels_tab[0][13] = put_rv40_qpel16_mc13_c;
+ c->put_rv40_qpel_pixels_tab[0][14] = put_rv40_qpel16_mc23_c;
+ c->avg_rv40_qpel_pixels_tab[0][ 0] = c->avg_h264_qpel_pixels_tab[0][0];
+ c->avg_rv40_qpel_pixels_tab[0][ 1] = avg_rv40_qpel16_mc10_c;
+ c->avg_rv40_qpel_pixels_tab[0][ 2] = avg_rv40_qpel16_mc20_c;
+ c->avg_rv40_qpel_pixels_tab[0][ 3] = avg_rv40_qpel16_mc30_c;
+ c->avg_rv40_qpel_pixels_tab[0][ 4] = avg_rv40_qpel16_mc01_c;
+ c->avg_rv40_qpel_pixels_tab[0][ 5] = avg_rv40_qpel16_mc11_c;
+ c->avg_rv40_qpel_pixels_tab[0][ 6] = avg_rv40_qpel16_mc21_c;
+ c->avg_rv40_qpel_pixels_tab[0][ 7] = avg_rv40_qpel16_mc31_c;
+ c->avg_rv40_qpel_pixels_tab[0][ 8] = avg_rv40_qpel16_mc02_c;
+ c->avg_rv40_qpel_pixels_tab[0][ 9] = avg_rv40_qpel16_mc12_c;
+ c->avg_rv40_qpel_pixels_tab[0][10] = avg_rv40_qpel16_mc22_c;
+ c->avg_rv40_qpel_pixels_tab[0][11] = avg_rv40_qpel16_mc32_c;
+ c->avg_rv40_qpel_pixels_tab[0][12] = avg_rv40_qpel16_mc03_c;
+ c->avg_rv40_qpel_pixels_tab[0][13] = avg_rv40_qpel16_mc13_c;
+ c->avg_rv40_qpel_pixels_tab[0][14] = avg_rv40_qpel16_mc23_c;
+ c->put_rv40_qpel_pixels_tab[1][ 0] = c->put_h264_qpel_pixels_tab[1][0];
+ c->put_rv40_qpel_pixels_tab[1][ 1] = put_rv40_qpel8_mc10_c;
+ c->put_rv40_qpel_pixels_tab[1][ 2] = put_rv40_qpel8_mc20_c;
+ c->put_rv40_qpel_pixels_tab[1][ 3] = put_rv40_qpel8_mc30_c;
+ c->put_rv40_qpel_pixels_tab[1][ 4] = put_rv40_qpel8_mc01_c;
+ c->put_rv40_qpel_pixels_tab[1][ 5] = put_rv40_qpel8_mc11_c;
+ c->put_rv40_qpel_pixels_tab[1][ 6] = put_rv40_qpel8_mc21_c;
+ c->put_rv40_qpel_pixels_tab[1][ 7] = put_rv40_qpel8_mc31_c;
+ c->put_rv40_qpel_pixels_tab[1][ 8] = put_rv40_qpel8_mc02_c;
+ c->put_rv40_qpel_pixels_tab[1][ 9] = put_rv40_qpel8_mc12_c;
+ c->put_rv40_qpel_pixels_tab[1][10] = put_rv40_qpel8_mc22_c;
+ c->put_rv40_qpel_pixels_tab[1][11] = put_rv40_qpel8_mc32_c;
+ c->put_rv40_qpel_pixels_tab[1][12] = put_rv40_qpel8_mc03_c;
+ c->put_rv40_qpel_pixels_tab[1][13] = put_rv40_qpel8_mc13_c;
+ c->put_rv40_qpel_pixels_tab[1][14] = put_rv40_qpel8_mc23_c;
+ c->avg_rv40_qpel_pixels_tab[1][ 0] = c->avg_h264_qpel_pixels_tab[1][0];
+ c->avg_rv40_qpel_pixels_tab[1][ 1] = avg_rv40_qpel8_mc10_c;
+ c->avg_rv40_qpel_pixels_tab[1][ 2] = avg_rv40_qpel8_mc20_c;
+ c->avg_rv40_qpel_pixels_tab[1][ 3] = avg_rv40_qpel8_mc30_c;
+ c->avg_rv40_qpel_pixels_tab[1][ 4] = avg_rv40_qpel8_mc01_c;
+ c->avg_rv40_qpel_pixels_tab[1][ 5] = avg_rv40_qpel8_mc11_c;
+ c->avg_rv40_qpel_pixels_tab[1][ 6] = avg_rv40_qpel8_mc21_c;
+ c->avg_rv40_qpel_pixels_tab[1][ 7] = avg_rv40_qpel8_mc31_c;
+ c->avg_rv40_qpel_pixels_tab[1][ 8] = avg_rv40_qpel8_mc02_c;
+ c->avg_rv40_qpel_pixels_tab[1][ 9] = avg_rv40_qpel8_mc12_c;
+ c->avg_rv40_qpel_pixels_tab[1][10] = avg_rv40_qpel8_mc22_c;
+ c->avg_rv40_qpel_pixels_tab[1][11] = avg_rv40_qpel8_mc32_c;
+ c->avg_rv40_qpel_pixels_tab[1][12] = avg_rv40_qpel8_mc03_c;
+ c->avg_rv40_qpel_pixels_tab[1][13] = avg_rv40_qpel8_mc13_c;
+ c->avg_rv40_qpel_pixels_tab[1][14] = avg_rv40_qpel8_mc23_c;
+
+ c->put_rv40_chroma_pixels_tab[0]= put_rv40_chroma_mc8_c;
+ c->put_rv40_chroma_pixels_tab[1]= put_rv40_chroma_mc4_c;
+ c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_c;
+ c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_c;
+}
diff --git a/libavcodec/sh4/dsputil_align.c b/libavcodec/sh4/dsputil_align.c
index 7c49a06..d9286e1 100644
--- a/libavcodec/sh4/dsputil_align.c
+++ b/libavcodec/sh4/dsputil_align.c
@@ -272,36 +272,36 @@ static void op##_##rnd##_pixels##sz##_##xy (uint8_t * dest, const uint8_t * ref,
#define OP put
-DEFFUNC(put, rnd,o,8,OP_C,avg2)
-DEFFUNC(put, rnd,x,8,OP_X,avg2)
-DEFFUNC(put,no_rnd,x,8,OP_X,avg2)
-DEFFUNC(put, rnd,y,8,OP_Y,avg2)
-DEFFUNC(put,no_rnd,y,8,OP_Y,avg2)
+DEFFUNC(put, rnd,o,8,OP_C,avg32)
+DEFFUNC(put, rnd,x,8,OP_X,avg32)
+DEFFUNC(put,no_rnd,x,8,OP_X,avg32)
+DEFFUNC(put, rnd,y,8,OP_Y,avg32)
+DEFFUNC(put,no_rnd,y,8,OP_Y,avg32)
DEFFUNC(put, rnd,xy,8,OP_XY,PACK)
DEFFUNC(put,no_rnd,xy,8,OP_XY,PACK)
-DEFFUNC(put, rnd,o,16,OP_C,avg2)
-DEFFUNC(put, rnd,x,16,OP_X,avg2)
-DEFFUNC(put,no_rnd,x,16,OP_X,avg2)
-DEFFUNC(put, rnd,y,16,OP_Y,avg2)
-DEFFUNC(put,no_rnd,y,16,OP_Y,avg2)
+DEFFUNC(put, rnd,o,16,OP_C,avg32)
+DEFFUNC(put, rnd,x,16,OP_X,avg32)
+DEFFUNC(put,no_rnd,x,16,OP_X,avg32)
+DEFFUNC(put, rnd,y,16,OP_Y,avg32)
+DEFFUNC(put,no_rnd,y,16,OP_Y,avg32)
DEFFUNC(put, rnd,xy,16,OP_XY,PACK)
DEFFUNC(put,no_rnd,xy,16,OP_XY,PACK)
#undef OP
#define OP avg
-DEFFUNC(avg, rnd,o,8,OP_C,avg2)
-DEFFUNC(avg, rnd,x,8,OP_X,avg2)
-DEFFUNC(avg,no_rnd,x,8,OP_X,avg2)
-DEFFUNC(avg, rnd,y,8,OP_Y,avg2)
-DEFFUNC(avg,no_rnd,y,8,OP_Y,avg2)
+DEFFUNC(avg, rnd,o,8,OP_C,avg32)
+DEFFUNC(avg, rnd,x,8,OP_X,avg32)
+DEFFUNC(avg,no_rnd,x,8,OP_X,avg32)
+DEFFUNC(avg, rnd,y,8,OP_Y,avg32)
+DEFFUNC(avg,no_rnd,y,8,OP_Y,avg32)
DEFFUNC(avg, rnd,xy,8,OP_XY,PACK)
DEFFUNC(avg,no_rnd,xy,8,OP_XY,PACK)
-DEFFUNC(avg, rnd,o,16,OP_C,avg2)
-DEFFUNC(avg, rnd,x,16,OP_X,avg2)
-DEFFUNC(avg,no_rnd,x,16,OP_X,avg2)
-DEFFUNC(avg, rnd,y,16,OP_Y,avg2)
-DEFFUNC(avg,no_rnd,y,16,OP_Y,avg2)
+DEFFUNC(avg, rnd,o,16,OP_C,avg32)
+DEFFUNC(avg, rnd,x,16,OP_X,avg32)
+DEFFUNC(avg,no_rnd,x,16,OP_X,avg32)
+DEFFUNC(avg, rnd,y,16,OP_Y,avg32)
+DEFFUNC(avg,no_rnd,y,16,OP_Y,avg32)
DEFFUNC(avg, rnd,xy,16,OP_XY,PACK)
DEFFUNC(avg,no_rnd,xy,16,OP_XY,PACK)
@@ -370,22 +370,22 @@ void dsputil_init_align(DSPContext* c, AVCodecContext *avctx)
#ifdef QPEL
#define dspfunc(PFX, IDX, NUM) \
- c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
- c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
- c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
- c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
- c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
- c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
- c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
- c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
- c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
- c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
- c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
- c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
- c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
- c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
- c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
- c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
+ c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_sh4; \
+ c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_sh4; \
+ c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_sh4; \
+ c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_sh4; \
+ c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_sh4; \
+ c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_sh4; \
+ c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_sh4; \
+ c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_sh4; \
+ c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_sh4; \
+ c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_sh4; \
+ c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_sh4; \
+ c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_sh4; \
+ c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_sh4; \
+ c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_sh4; \
+ c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_sh4; \
+ c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_sh4
dspfunc(put_qpel, 0, 16);
dspfunc(put_no_rnd_qpel, 0, 16);
@@ -407,21 +407,21 @@ void dsputil_init_align(DSPContext* c, AVCodecContext *avctx)
dspfunc(avg_h264_qpel, 2, 4);
#undef dspfunc
- c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
- c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
- c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
- c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
- c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
- c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
-
- c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
- c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
- c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
- c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
- c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
- c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
- c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
- c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
+ c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_sh4;
+ c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_sh4;
+ c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_sh4;
+ c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_sh4;
+ c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_sh4;
+ c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_sh4;
+
+ c->put_mspel_pixels_tab[0]= put_mspel8_mc00_sh4;
+ c->put_mspel_pixels_tab[1]= put_mspel8_mc10_sh4;
+ c->put_mspel_pixels_tab[2]= put_mspel8_mc20_sh4;
+ c->put_mspel_pixels_tab[3]= put_mspel8_mc30_sh4;
+ c->put_mspel_pixels_tab[4]= put_mspel8_mc02_sh4;
+ c->put_mspel_pixels_tab[5]= put_mspel8_mc12_sh4;
+ c->put_mspel_pixels_tab[6]= put_mspel8_mc22_sh4;
+ c->put_mspel_pixels_tab[7]= put_mspel8_mc32_sh4;
c->gmc1 = gmc1_c;
c->gmc = gmc_c;
diff --git a/libavcodec/sh4/dsputil_sh4.c b/libavcodec/sh4/dsputil_sh4.c
index e7e2de6..2c86a7a 100644
--- a/libavcodec/sh4/dsputil_sh4.c
+++ b/libavcodec/sh4/dsputil_sh4.c
@@ -22,16 +22,15 @@
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
+#include "sh4.h"
static void memzero_align8(void *dst,size_t size)
{
-#if defined(__SH4__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
- (char*)dst+=size;
- size/=8*4;
- __asm__(
-#if defined(__SH4__)
- " fschg\n" //single float mode
-#endif
+ int fpscr;
+ fp_single_enter(fpscr);
+ dst = (char *)dst + size;
+ size /= 32;
+ __asm__ volatile (
" fldi0 fr0\n"
" fldi0 fr1\n"
" fschg\n" // double
@@ -42,35 +41,22 @@ static void memzero_align8(void *dst,size_t size)
" fmov dr0, at -%0\n"
" bf.s 1b\n"
" fmov dr0, at -%0\n"
-#if defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
" fschg" //back to single
-#endif
- : : "r"(dst),"r"(size): "memory" );
-#else
- double *d = dst;
- size/=8*4;
- do {
- d[0] = 0.0;
- d[1] = 0.0;
- d[2] = 0.0;
- d[3] = 0.0;
- d+=4;
- } while(--size);
-#endif
+ : "+r"(dst),"+r"(size) :: "memory" );
+ fp_single_leave(fpscr);
}
static void clear_blocks_sh4(DCTELEM *blocks)
{
-// if (((int)blocks&7)==0)
memzero_align8(blocks,sizeof(DCTELEM)*6*64);
}
-extern void idct_sh4(DCTELEM *block);
+void idct_sh4(DCTELEM *block);
static void idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
- idct_sh4(block);
int i;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+ idct_sh4(block);
for(i=0;i<8;i++) {
dest[0] = cm[block[0]];
dest[1] = cm[block[1]];
@@ -86,9 +72,9 @@ static void idct_put(uint8_t *dest, int line_size, DCTELEM *block)
}
static void idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
- idct_sh4(block);
int i;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+ idct_sh4(block);
for(i=0;i<8;i++) {
dest[0] = cm[dest[0]+block[0]];
dest[1] = cm[dest[1]+block[1]];
@@ -103,7 +89,7 @@ static void idct_add(uint8_t *dest, int line_size, DCTELEM *block)
}
}
-extern void dsputil_init_align(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_align(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx)
{
@@ -115,6 +101,6 @@ void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx)
c->idct_put = idct_put;
c->idct_add = idct_add;
c->idct = idct_sh4;
- c->idct_permutation_type= FF_NO_IDCT_PERM; //FF_SIMPLE_IDCT_PERM; //FF_LIBMPEG2_IDCT_PERM;
+ c->idct_permutation_type= FF_NO_IDCT_PERM;
}
}
diff --git a/libavcodec/sh4/idct_sh4.c b/libavcodec/sh4/idct_sh4.c
index b684e8f..5c461e7 100644
--- a/libavcodec/sh4/idct_sh4.c
+++ b/libavcodec/sh4/idct_sh4.c
@@ -21,6 +21,8 @@
*/
#include "libavcodec/dsputil.h"
+#include "sh4.h"
+
#define c1 1.38703984532214752434 /* sqrt(2)*cos(1*pi/16) */
#define c2 1.30656296487637657577 /* sqrt(2)*cos(2*pi/16) */
#define c3 1.17587560241935884520 /* sqrt(2)*cos(3*pi/16) */
@@ -51,9 +53,11 @@ static const float odd_table[] __attribute__ ((aligned(8))) = {
#undef c6
#undef c7
-#if defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
+#if 1
#define load_matrix(table) \
+ do { \
+ const float *t = table; \
__asm__ volatile( \
" fschg\n" \
" fmov @%0+,xd0\n" \
@@ -65,15 +69,13 @@ static const float odd_table[] __attribute__ ((aligned(8))) = {
" fmov @%0+,xd12\n" \
" fmov @%0+,xd14\n" \
" fschg\n" \
- :\
- : "r"(table)\
- : "0" \
- )
+ : "+r"(t) \
+ ); \
+ } while (0)
#define ftrv() \
__asm__ volatile("ftrv xmtrx,fv0" \
- : "=f"(fr0),"=f"(fr1),"=f"(fr2),"=f"(fr3) \
- : "0"(fr0), "1"(fr1), "2"(fr2), "3"(fr3) );
+ : "+f"(fr0),"+f"(fr1),"+f"(fr2),"+f"(fr3));
#define DEFREG \
register float fr0 __asm__("fr0"); \
@@ -136,10 +138,9 @@ void idct_sh4(DCTELEM *block)
int i;
float tblock[8*8],*fblock;
int ofs1,ofs2,ofs3;
+ int fpscr;
-#if defined(__SH4__)
-#error "FIXME!! change to single float"
-#endif
+ fp_single_enter(fpscr);
/* row */
@@ -168,10 +169,6 @@ void idct_sh4(DCTELEM *block)
i = 8;
-// ofs1 = sizeof(float)*1;
-// ofs2 = sizeof(float)*2;
-// ofs3 = sizeof(float)*3;
-
do {
float t0,t1,t2,t3;
fr0 = block[1];
@@ -252,9 +249,7 @@ void idct_sh4(DCTELEM *block)
block++;
} while(--i);
-#if defined(__SH4__)
-#error "FIXME!! change to double"
-#endif
+ fp_single_leave(fpscr);
}
#else
void idct_sh4(DCTELEM *block)
diff --git a/libavcodec/sh4/qpel.c b/libavcodec/sh4/qpel.c
index a75d22f..2069bd3 100644
--- a/libavcodec/sh4/qpel.c
+++ b/libavcodec/sh4/qpel.c
@@ -22,63 +22,6 @@
*/
#define PIXOP2(OPNAME, OP) \
-/*static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
-{\
- do {\
- OP(LP(dst ),no_rnd_avg32(AV_RN32(src1 ),AV_RN32(src2 )) ); \
- OP(LP(dst+4),no_rnd_avg32(AV_RN32(src1+4),AV_RN32(src2+4)) ); \
- src1+=src_stride1; \
- src2+=src_stride2; \
- dst+=dst_stride; \
- } while(--h); \
-}\
-\
-static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
-{\
- do {\
- OP(LP(dst ),rnd_avg32(AV_RN32(src1 ),AV_RN32(src2 )) ); \
- OP(LP(dst+4),rnd_avg32(AV_RN32(src1+4),AV_RN32(src2+4)) ); \
- src1+=src_stride1; \
- src2+=src_stride2; \
- dst+=dst_stride; \
- } while(--h); \
-}\
-\
-static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
-{\
- do {\
- OP(LP(dst ),rnd_avg32(AV_RN32(src1 ),AV_RN32(src2 )) ); \
- src1+=src_stride1; \
- src2+=src_stride2; \
- dst+=dst_stride; \
- } while(--h); \
-}\
-\
-static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
-{\
- do {\
- OP(LP(dst ),no_rnd_avg32(AV_RN32(src1 ),AV_RN32(src2 )) ); \
- OP(LP(dst+4),no_rnd_avg32(AV_RN32(src1+4),AV_RN32(src2+4)) ); \
- OP(LP(dst+8),no_rnd_avg32(AV_RN32(src1+8),AV_RN32(src2+8)) ); \
- OP(LP(dst+12),no_rnd_avg32(AV_RN32(src1+12),AV_RN32(src2+12)) ); \
- src1+=src_stride1; \
- src2+=src_stride2; \
- dst+=dst_stride; \
- } while(--h); \
-}\
-\
-static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
-{\
- do {\
- OP(LP(dst ),rnd_avg32(AV_RN32(src1 ),AV_RN32(src2 )) ); \
- OP(LP(dst+4),rnd_avg32(AV_RN32(src1+4),AV_RN32(src2+4)) ); \
- OP(LP(dst+8),rnd_avg32(AV_RN32(src1+8),AV_RN32(src2+8)) ); \
- OP(LP(dst+12),rnd_avg32(AV_RN32(src1+12),AV_RN32(src2+12)) ); \
- src1+=src_stride1; \
- src2+=src_stride2; \
- dst+=dst_stride; \
- } while(--h); \
-}*/\
\
static inline void OPNAME ## _pixels4_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{\
@@ -472,7 +415,7 @@ static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
}
}
#define H264_CHROMA_MC(OPNAME, OP)\
-static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
+static void OPNAME ## h264_chroma_mc2_sh4(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
const int A=(8-x)*(8-y);\
const int B=( x)*(8-y);\
const int C=(8-x)*( y);\
@@ -494,7 +437,7 @@ static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*a
}while(--h);\
}\
\
-static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
+static void OPNAME ## h264_chroma_mc4_sh4(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
const int A=(8-x)*(8-y);\
const int B=( x)*(8-y);\
const int C=(8-x)*( y);\
@@ -520,7 +463,7 @@ static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*a
}while(--h);\
}\
\
-static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
+static void OPNAME ## h264_chroma_mc8_sh4(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
const int A=(8-x)*(8-y);\
const int B=( x)*(8-y);\
const int C=(8-x)*( y);\
@@ -707,27 +650,27 @@ static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dst
}while(--w);\
}\
\
-static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
-static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t half[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
OPNAME ## pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);\
}\
\
-static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
-static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t half[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
OPNAME ## pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
-static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t half[64];\
copy_block9(full, src, 16, stride, 9);\
@@ -735,31 +678,20 @@ static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels8_l2_aligned(dst, full, half, stride, 16, 8, 8);\
}\
\
-static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
copy_block9(full, src, 16, stride, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
-static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t half[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
OPNAME ## pixels8_l2_aligned(dst, full+16, half, stride, 16, 8, 8);\
}\
-static void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
- uint8_t full[16*9];\
- uint8_t halfH[72];\
- uint8_t halfV[64];\
- uint8_t halfHV[64];\
- copy_block9(full, src, 16, stride, 9);\
- put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
- put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
- put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
- OPNAME ## pixels8_l4_aligned(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfH[72];\
uint8_t halfHV[64];\
@@ -769,18 +701,7 @@ static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
-static void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
- uint8_t full[16*9];\
- uint8_t halfH[72];\
- uint8_t halfV[64];\
- uint8_t halfHV[64];\
- copy_block9(full, src, 16, stride, 9);\
- put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
- put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
- put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
- OPNAME ## pixels8_l4_aligned0(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfH[72];\
uint8_t halfHV[64];\
@@ -790,18 +711,7 @@ static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
-static void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
- uint8_t full[16*9];\
- uint8_t halfH[72];\
- uint8_t halfV[64];\
- uint8_t halfHV[64];\
- copy_block9(full, src, 16, stride, 9);\
- put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
- put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
- put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
- OPNAME ## pixels8_l4_aligned(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfH[72];\
uint8_t halfHV[64];\
@@ -811,18 +721,7 @@ static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
-static void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
- uint8_t full[16*9];\
- uint8_t halfH[72];\
- uint8_t halfV[64];\
- uint8_t halfHV[64];\
- copy_block9(full, src, 16, stride, 9);\
- put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
- put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
- put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
- OPNAME ## pixels8_l4_aligned0(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfH[72];\
uint8_t halfHV[64];\
@@ -832,32 +731,21 @@ static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
-static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[72];\
uint8_t halfHV[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
-static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[72];\
uint8_t halfHV[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
-static void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
- uint8_t full[16*9];\
- uint8_t halfH[72];\
- uint8_t halfV[64];\
- uint8_t halfHV[64];\
- copy_block9(full, src, 16, stride, 9);\
- put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
- put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
- put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
- OPNAME ## pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfH[72];\
copy_block9(full, src, 16, stride, 9);\
@@ -865,18 +753,7 @@ static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
-static void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
- uint8_t full[16*9];\
- uint8_t halfH[72];\
- uint8_t halfV[64];\
- uint8_t halfHV[64];\
- copy_block9(full, src, 16, stride, 9);\
- put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
- put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
- put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
- OPNAME ## pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);\
-}\
-static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfH[72];\
copy_block9(full, src, 16, stride, 9);\
@@ -884,32 +761,32 @@ static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
-static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel8_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[72];\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
-static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
-static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t half[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
OPNAME ## pixels16_l2_aligned2(dst, src, half, stride, stride, 16, 16);\
}\
\
-static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
-static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t half[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
OPNAME ## pixels16_l2_aligned2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
-static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t half[256];\
copy_block17(full, src, 24, stride, 17);\
@@ -917,31 +794,20 @@ static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels16_l2_aligned(dst, full, half, stride, 24, 16, 16);\
}\
\
-static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
copy_block17(full, src, 24, stride, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
-static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t half[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
OPNAME ## pixels16_l2_aligned(dst, full+24, half, stride, 24, 16, 16);\
}\
-static void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
- uint8_t full[24*17];\
- uint8_t halfH[272];\
- uint8_t halfV[256];\
- uint8_t halfHV[256];\
- copy_block17(full, src, 24, stride, 17);\
- put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
- put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
- put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
- OPNAME ## pixels16_l4_aligned(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
@@ -951,18 +817,7 @@ static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
-static void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
- uint8_t full[24*17];\
- uint8_t halfH[272];\
- uint8_t halfV[256];\
- uint8_t halfHV[256];\
- copy_block17(full, src, 24, stride, 17);\
- put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
- put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
- put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
- OPNAME ## pixels16_l4_aligned0(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
@@ -972,18 +827,7 @@ static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
-static void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
- uint8_t full[24*17];\
- uint8_t halfH[272];\
- uint8_t halfV[256];\
- uint8_t halfHV[256];\
- copy_block17(full, src, 24, stride, 17);\
- put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
- put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
- put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
- OPNAME ## pixels16_l4_aligned(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
@@ -993,18 +837,7 @@ static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
-static void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
- uint8_t full[24*17];\
- uint8_t halfH[272];\
- uint8_t halfV[256];\
- uint8_t halfHV[256];\
- copy_block17(full, src, 24, stride, 17);\
- put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
- put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
- put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
- OPNAME ## pixels16_l4_aligned0(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
@@ -1014,32 +847,21 @@ static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
-static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[272];\
uint8_t halfHV[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
-static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[272];\
uint8_t halfHV[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
-static void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
- uint8_t full[24*17];\
- uint8_t halfH[272];\
- uint8_t halfV[256];\
- uint8_t halfHV[256];\
- copy_block17(full, src, 24, stride, 17);\
- put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
- put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
- put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
- OPNAME ## pixels16_l2_aligned(dst, halfV, halfHV, stride, 16, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
copy_block17(full, src, 24, stride, 17);\
@@ -1047,18 +869,7 @@ static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
-static void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
- uint8_t full[24*17];\
- uint8_t halfH[272];\
- uint8_t halfV[256];\
- uint8_t halfHV[256];\
- copy_block17(full, src, 24, stride, 17);\
- put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
- put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
- put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
- OPNAME ## pixels16_l2_aligned(dst, halfV, halfHV, stride, 16, 16, 16);\
-}\
-static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
copy_block17(full, src, 24, stride, 17);\
@@ -1066,7 +877,7 @@ static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
-static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## qpel16_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[272];\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
@@ -1332,27 +1143,27 @@ static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t
}\
#define H264_MC(OPNAME, SIZE) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
-static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t half[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_aligned2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
-static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
-static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t half[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_aligned2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
-static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[SIZE*(SIZE+5)];\
uint8_t * const full_mid= full + SIZE*2;\
uint8_t half[SIZE*SIZE];\
@@ -1361,14 +1172,14 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, i
OPNAME ## pixels ## SIZE ## _l2_aligned(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
-static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[SIZE*(SIZE+5)];\
uint8_t * const full_mid= full + SIZE*2;\
copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
-static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[SIZE*(SIZE+5)];\
uint8_t * const full_mid= full + SIZE*2;\
uint8_t half[SIZE*SIZE];\
@@ -1377,7 +1188,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, i
OPNAME ## pixels ## SIZE ## _l2_aligned(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
-static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[SIZE*(SIZE+5)];\
uint8_t * const full_mid= full + SIZE*2;\
uint8_t halfH[SIZE*SIZE];\
@@ -1388,7 +1199,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, i
OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
-static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[SIZE*(SIZE+5)];\
uint8_t * const full_mid= full + SIZE*2;\
uint8_t halfH[SIZE*SIZE];\
@@ -1399,7 +1210,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, i
OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
-static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[SIZE*(SIZE+5)];\
uint8_t * const full_mid= full + SIZE*2;\
uint8_t halfH[SIZE*SIZE];\
@@ -1410,7 +1221,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, i
OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
-static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[SIZE*(SIZE+5)];\
uint8_t * const full_mid= full + SIZE*2;\
uint8_t halfH[SIZE*SIZE];\
@@ -1421,12 +1232,12 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, i
OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
-static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
int16_t tmp[SIZE*(SIZE+5)];\
OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
-static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
int16_t tmp[SIZE*(SIZE+5)];\
uint8_t halfH[SIZE*SIZE];\
uint8_t halfHV[SIZE*SIZE];\
@@ -1435,7 +1246,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, i
OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
-static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
int16_t tmp[SIZE*(SIZE+5)];\
uint8_t halfH[SIZE*SIZE];\
uint8_t halfHV[SIZE*SIZE];\
@@ -1444,7 +1255,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, i
OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
-static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[SIZE*(SIZE+5)];\
uint8_t * const full_mid= full + SIZE*2;\
int16_t tmp[SIZE*(SIZE+5)];\
@@ -1456,7 +1267,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, i
OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
-static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
+static void OPNAME ## h264_qpel ## SIZE ## _mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[SIZE*(SIZE+5)];\
uint8_t * const full_mid= full + SIZE*2;\
int16_t tmp[SIZE*(SIZE+5)];\
@@ -1549,31 +1360,31 @@ static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int
}while(--w);
}
-static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
+static void put_mspel8_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){
put_pixels8_c(dst, src, stride, 8);
}
-static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
+static void put_mspel8_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){
uint8_t half[64];
wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
put_pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);
}
-static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
+static void put_mspel8_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){
wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
-static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
+static void put_mspel8_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){
uint8_t half[64];
wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
put_pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);
}
-static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
+static void put_mspel8_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){
wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
-static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
+static void put_mspel8_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){
uint8_t halfH[88];
uint8_t halfV[64];
uint8_t halfHV[64];
@@ -1582,7 +1393,7 @@ static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
put_pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);
}
-static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
+static void put_mspel8_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){
uint8_t halfH[88];
uint8_t halfV[64];
uint8_t halfHV[64];
@@ -1591,7 +1402,7 @@ static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
put_pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);
}
-static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
+static void put_mspel8_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){
uint8_t halfH[88];
wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
diff --git a/libavcodec/sh4/sh4.h b/libavcodec/sh4/sh4.h
new file mode 100644
index 0000000..5d46540
--- /dev/null
+++ b/libavcodec/sh4/sh4.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_SH4_SH4_H
+#define AVCODEC_SH4_SH4_H
+
+#ifdef __SH4__
+# define fp_single_enter(fpscr) \
+ do { \
+ __asm__ volatile ("sts fpscr, %0 \n\t" \
+ "and %1, %0 \n\t" \
+ "lds %0, fpscr \n\t" \
+ : "=&r"(fpscr) : "r"(~(1<<19))); \
+ } while (0)
+
+# define fp_single_leave(fpscr) \
+ do { \
+ __asm__ volatile ("or %1, %0 \n\t" \
+ "lds %0, fpscr \n\t" \
+ : "+r"(fpscr) : "r"(1<<19)); \
+ } while (0)
+#else
+# define fp_single_enter(fpscr) ((void)fpscr)
+# define fp_single_leave(fpscr)
+#endif
+
+#endif /* AVCODEC_SH4_SH4_H */
diff --git a/libavcodec/simple_idct.c b/libavcodec/simple_idct.c
index 62f5b24..2cf9e8d 100644
--- a/libavcodec/simple_idct.c
+++ b/libavcodec/simple_idct.c
@@ -31,6 +31,7 @@
*/
#include "avcodec.h"
#include "dsputil.h"
+#include "mathops.h"
#include "simple_idct.h"
#if 0
@@ -55,26 +56,6 @@
#define COL_SHIFT 20 // 6
#endif
-#if defined(ARCH_POWERPC_405)
-
-/* signed 16x16 -> 32 multiply add accumulate */
-#define MAC16(rt, ra, rb) \
- __asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));
-
-/* signed 16x16 -> 32 multiply */
-#define MUL16(rt, ra, rb) \
- __asm__ ("mullhw %0, %1, %2" : "=r" (rt) : "r" (ra), "r" (rb));
-
-#else
-
-/* signed 16x16 -> 32 multiply add accumulate */
-#define MAC16(rt, ra, rb) rt += (ra) * (rb)
-
-/* signed 16x16 -> 32 multiply */
-#define MUL16(rt, ra, rb) rt = (ra) * (rb)
-
-#endif
-
static inline void idctRowCondDC (DCTELEM * row)
{
int a0, a1, a2, a3, b0, b1, b2, b3;
@@ -137,13 +118,13 @@ static inline void idctRowCondDC (DCTELEM * row)
a2 -= W6 * row[2];
a3 -= W2 * row[2];
- MUL16(b0, W1, row[1]);
+ b0 = MUL16(W1, row[1]);
MAC16(b0, W3, row[3]);
- MUL16(b1, W3, row[1]);
+ b1 = MUL16(W3, row[1]);
MAC16(b1, -W7, row[3]);
- MUL16(b2, W5, row[1]);
+ b2 = MUL16(W5, row[1]);
MAC16(b2, -W1, row[3]);
- MUL16(b3, W7, row[1]);
+ b3 = MUL16(W7, row[1]);
MAC16(b3, -W5, row[3]);
#ifdef HAVE_FAST_64BIT
@@ -197,10 +178,10 @@ static inline void idctSparseColPut (uint8_t *dest, int line_size,
a2 += - W6*col[8*2];
a3 += - W2*col[8*2];
- MUL16(b0, W1, col[8*1]);
- MUL16(b1, W3, col[8*1]);
- MUL16(b2, W5, col[8*1]);
- MUL16(b3, W7, col[8*1]);
+ b0 = MUL16(W1, col[8*1]);
+ b1 = MUL16(W3, col[8*1]);
+ b2 = MUL16(W5, col[8*1]);
+ b3 = MUL16(W7, col[8*1]);
MAC16(b0, + W3, col[8*3]);
MAC16(b1, - W7, col[8*3]);
@@ -269,10 +250,10 @@ static inline void idctSparseColAdd (uint8_t *dest, int line_size,
a2 += - W6*col[8*2];
a3 += - W2*col[8*2];
- MUL16(b0, W1, col[8*1]);
- MUL16(b1, W3, col[8*1]);
- MUL16(b2, W5, col[8*1]);
- MUL16(b3, W7, col[8*1]);
+ b0 = MUL16(W1, col[8*1]);
+ b1 = MUL16(W3, col[8*1]);
+ b2 = MUL16(W5, col[8*1]);
+ b3 = MUL16(W7, col[8*1]);
MAC16(b0, + W3, col[8*3]);
MAC16(b1, - W7, col[8*3]);
@@ -339,10 +320,10 @@ static inline void idctSparseCol (DCTELEM * col)
a2 += - W6*col[8*2];
a3 += - W2*col[8*2];
- MUL16(b0, W1, col[8*1]);
- MUL16(b1, W3, col[8*1]);
- MUL16(b2, W5, col[8*1]);
- MUL16(b3, W7, col[8*1]);
+ b0 = MUL16(W1, col[8*1]);
+ b1 = MUL16(W3, col[8*1]);
+ b2 = MUL16(W5, col[8*1]);
+ b3 = MUL16(W7, col[8*1]);
MAC16(b0, + W3, col[8*3]);
MAC16(b1, - W7, col[8*3]);
diff --git a/libavcodec/smacker.c b/libavcodec/smacker.c
index 1690518..bcdf792 100644
--- a/libavcodec/smacker.c
+++ b/libavcodec/smacker.c
@@ -457,8 +457,8 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, const
case 2:
for(i = 0; i < 2; i++) {
uint16_t pix1, pix2;
- pix1 = smk_get_code(&gb, smk->full_tbl, smk->full_last);
pix2 = smk_get_code(&gb, smk->full_tbl, smk->full_last);
+ pix1 = smk_get_code(&gb, smk->full_tbl, smk->full_last);
AV_WL16(out,pix1);
AV_WL16(out+2,pix2);
out += stride;
@@ -559,6 +559,7 @@ static av_cold int decode_end(AVCodecContext *avctx)
static av_cold int smka_decode_init(AVCodecContext *avctx)
{
avctx->sample_fmt = SAMPLE_FMT_S16;
+ avctx->channel_layout = (avctx->channels==2) ? CH_LAYOUT_STEREO : CH_LAYOUT_MONO;
return 0;
}
diff --git a/libavcodec/snow.c b/libavcodec/snow.c
index c5c73b1..e3e5aa9 100644
--- a/libavcodec/snow.c
+++ b/libavcodec/snow.c
@@ -488,6 +488,8 @@ typedef struct SnowContext{
slice_buffer sb;
MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to eventually make the motion estimation independent of MpegEncContext, so this will be removed then (FIXME/XXX)
+
+ uint8_t *scratchbuf;
}SnowContext;
typedef struct {
@@ -496,7 +498,7 @@ typedef struct {
IDWTELEM *b2;
IDWTELEM *b3;
int y;
-} dwt_compose_t;
+} DWTCompose;
#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
//#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
@@ -1078,19 +1080,19 @@ static void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int
}
}
-static void spatial_compose53i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
+static void spatial_compose53i_buffered_init(DWTCompose *cs, slice_buffer * sb, int height, int stride_line){
cs->b0 = slice_buffer_get_line(sb, mirror(-1-1, height-1) * stride_line);
cs->b1 = slice_buffer_get_line(sb, mirror(-1 , height-1) * stride_line);
cs->y = -1;
}
-static void spatial_compose53i_init(dwt_compose_t *cs, IDWTELEM *buffer, int height, int stride){
+static void spatial_compose53i_init(DWTCompose *cs, IDWTELEM *buffer, int height, int stride){
cs->b0 = buffer + mirror(-1-1, height-1)*stride;
cs->b1 = buffer + mirror(-1 , height-1)*stride;
cs->y = -1;
}
-static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
+static void spatial_compose53i_dy_buffered(DWTCompose *cs, slice_buffer * sb, int width, int height, int stride_line){
int y= cs->y;
IDWTELEM *b0= cs->b0;
@@ -1109,7 +1111,7 @@ static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb,
cs->y += 2;
}
-static void spatial_compose53i_dy(dwt_compose_t *cs, IDWTELEM *buffer, int width, int height, int stride){
+static void spatial_compose53i_dy(DWTCompose *cs, IDWTELEM *buffer, int width, int height, int stride){
int y= cs->y;
IDWTELEM *b0= cs->b0;
IDWTELEM *b1= cs->b1;
@@ -1128,7 +1130,7 @@ static void spatial_compose53i_dy(dwt_compose_t *cs, IDWTELEM *buffer, int width
}
static void av_unused spatial_compose53i(IDWTELEM *buffer, int width, int height, int stride){
- dwt_compose_t cs;
+ DWTCompose cs;
spatial_compose53i_init(&cs, buffer, height, stride);
while(cs.y <= height)
spatial_compose53i_dy(&cs, buffer, width, height, stride);
@@ -1196,7 +1198,7 @@ void ff_snow_vertical_compose97i(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTE
}
}
-static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
+static void spatial_compose97i_buffered_init(DWTCompose *cs, slice_buffer * sb, int height, int stride_line){
cs->b0 = slice_buffer_get_line(sb, mirror(-3-1, height-1) * stride_line);
cs->b1 = slice_buffer_get_line(sb, mirror(-3 , height-1) * stride_line);
cs->b2 = slice_buffer_get_line(sb, mirror(-3+1, height-1) * stride_line);
@@ -1204,7 +1206,7 @@ static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * s
cs->y = -3;
}
-static void spatial_compose97i_init(dwt_compose_t *cs, IDWTELEM *buffer, int height, int stride){
+static void spatial_compose97i_init(DWTCompose *cs, IDWTELEM *buffer, int height, int stride){
cs->b0 = buffer + mirror(-3-1, height-1)*stride;
cs->b1 = buffer + mirror(-3 , height-1)*stride;
cs->b2 = buffer + mirror(-3+1, height-1)*stride;
@@ -1212,7 +1214,7 @@ static void spatial_compose97i_init(dwt_compose_t *cs, IDWTELEM *buffer, int hei
cs->y = -3;
}
-static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
+static void spatial_compose97i_dy_buffered(DSPContext *dsp, DWTCompose *cs, slice_buffer * sb, int width, int height, int stride_line){
int y = cs->y;
IDWTELEM *b0= cs->b0;
@@ -1241,7 +1243,7 @@ static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, s
cs->y += 2;
}
-static void spatial_compose97i_dy(dwt_compose_t *cs, IDWTELEM *buffer, int width, int height, int stride){
+static void spatial_compose97i_dy(DWTCompose *cs, IDWTELEM *buffer, int width, int height, int stride){
int y = cs->y;
IDWTELEM *b0= cs->b0;
IDWTELEM *b1= cs->b1;
@@ -1266,13 +1268,13 @@ static void spatial_compose97i_dy(dwt_compose_t *cs, IDWTELEM *buffer, int width
}
static void av_unused spatial_compose97i(IDWTELEM *buffer, int width, int height, int stride){
- dwt_compose_t cs;
+ DWTCompose cs;
spatial_compose97i_init(&cs, buffer, height, stride);
while(cs.y <= height)
spatial_compose97i_dy(&cs, buffer, width, height, stride);
}
-static void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
+static void ff_spatial_idwt_buffered_init(DWTCompose *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
int level;
for(level=decomposition_count-1; level>=0; level--){
switch(type){
@@ -1282,7 +1284,7 @@ static void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb,
}
}
-static void ff_spatial_idwt_init(dwt_compose_t *cs, IDWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
+static void ff_spatial_idwt_init(DWTCompose *cs, IDWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
int level;
for(level=decomposition_count-1; level>=0; level--){
switch(type){
@@ -1292,7 +1294,7 @@ static void ff_spatial_idwt_init(dwt_compose_t *cs, IDWTELEM *buffer, int width,
}
}
-static void ff_spatial_idwt_slice(dwt_compose_t *cs, IDWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
+static void ff_spatial_idwt_slice(DWTCompose *cs, IDWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
const int support = type==1 ? 3 : 5;
int level;
if(type==2) return;
@@ -1309,7 +1311,7 @@ static void ff_spatial_idwt_slice(dwt_compose_t *cs, IDWTELEM *buffer, int width
}
}
-static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
+static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, DWTCompose *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
const int support = type==1 ? 3 : 5;
int level;
if(type==2) return;
@@ -1327,7 +1329,7 @@ static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, s
}
static void ff_spatial_idwt(IDWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
- dwt_compose_t cs[MAX_DECOMPOSITIONS];
+ DWTCompose cs[MAX_DECOMPOSITIONS];
int y;
ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
for(y=0; y<height; y+=4)
@@ -2423,7 +2425,7 @@ static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer
BlockNode *rb= lb+1;
uint8_t *block[4];
int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
- uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
+ uint8_t *tmp = s->scratchbuf;
uint8_t *ptmp;
int x,y;
@@ -2785,7 +2787,7 @@ static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, con
uint8_t *dst= s->current_picture.data[plane_index];
uint8_t *src= s-> input_picture.data[plane_index];
IDWTELEM *pred= (IDWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
- uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment
+ uint8_t *cur = s->scratchbuf;
uint8_t tmp[ref_stride*(2*MB_SIZE+HTAPS_MAX-1)];
const int b_stride = s->b_width << s->block_max_depth;
const int b_height = s->b_height<< s->block_max_depth;
@@ -3703,6 +3705,7 @@ static av_cold int common_init(AVCodecContext *avctx){
scale_mv_ref[i][j] = 256*(i+1)/(j+1);
s->avctx->get_buffer(s->avctx, &s->mconly_picture);
+ s->scratchbuf = av_malloc(s->mconly_picture.linesize[0]*7*MB_SIZE);
return 0;
}
@@ -4004,6 +4007,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
s->m.flags = avctx->flags;
s->m.bit_rate= avctx->bit_rate;
+ s->m.me.temp =
s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
s->m.me.map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
@@ -4432,12 +4436,14 @@ static av_cold void common_end(SnowContext *s){
av_freep(&s->spatial_dwt_buffer);
av_freep(&s->spatial_idwt_buffer);
+ s->m.me.temp= NULL;
av_freep(&s->m.me.scratchpad);
av_freep(&s->m.me.map);
av_freep(&s->m.me.score_map);
av_freep(&s->m.obmc_scratchpad);
av_freep(&s->block);
+ av_freep(&s->scratchbuf);
for(i=0; i<MAX_REF_FRAMES; i++){
av_freep(&s->ref_mvs[i]);
@@ -4544,7 +4550,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, const
const int block_size = MB_SIZE >> s->block_max_depth;
const int block_w = plane_index ? block_size/2 : block_size;
int mb_y;
- dwt_compose_t cs[MAX_DECOMPOSITIONS];
+ DWTCompose cs[MAX_DECOMPOSITIONS];
int yd=0, yq=0;
int y;
int end_y;
diff --git a/libavcodec/snow.h b/libavcodec/snow.h
index 6f0d79f..b208e5f 100644
--- a/libavcodec/snow.h
+++ b/libavcodec/snow.h
@@ -121,16 +121,16 @@ struct slice_buffer_s {
#define W_DS 9
#endif
-extern void ff_snow_vertical_compose97i(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
-extern void ff_snow_horizontal_compose97i(IDWTELEM *b, int width);
-extern void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+void ff_snow_vertical_compose97i(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
+void ff_snow_horizontal_compose97i(IDWTELEM *b, int width);
+void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
#ifdef CONFIG_SNOW_ENCODER
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h);
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h);
#else
-static int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {assert (0);}
-static int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {assert (0);}
+static int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {assert (0); return 0;}
+static int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {assert (0); return 0;}
#endif
/* C bits used by mmx/sse2/altivec */
diff --git a/libavcodec/sparc/dsputil_vis.c b/libavcodec/sparc/dsputil_vis.c
index a01eea3..0779395 100644
--- a/libavcodec/sparc/dsputil_vis.c
+++ b/libavcodec/sparc/dsputil_vis.c
@@ -31,9 +31,9 @@
#include "vis.h"
-extern void ff_simple_idct_put_vis(uint8_t *dest, int line_size, DCTELEM *data);
-extern void ff_simple_idct_add_vis(uint8_t *dest, int line_size, DCTELEM *data);
-extern void ff_simple_idct_vis(DCTELEM *data);
+void ff_simple_idct_put_vis(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_simple_idct_add_vis(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_simple_idct_vis(DCTELEM *data);
/* The trick used in some of this file is the formula from the MMX
* motion comp code, which is:
diff --git a/libavcodec/sparc/simple_idct_vis.c b/libavcodec/sparc/simple_idct_vis.c
index 1581ee1..b78bc66 100644
--- a/libavcodec/sparc/simple_idct_vis.c
+++ b/libavcodec/sparc/simple_idct_vis.c
@@ -384,7 +384,7 @@ static const DECLARE_ALIGNED_8(uint16_t, expand[4]) = {
"st %%f14, [%12+" dest "] \n\t"\
-inline void ff_simple_idct_vis(DCTELEM *data) {
+void ff_simple_idct_vis(DCTELEM *data) {
int out1, out2, out3, out4;
DECLARE_ALIGNED_8(int16_t, temp[8*8]);
diff --git a/libavcodec/svq1.c b/libavcodec/svq1.c
index a499183..790bbb3 100644
--- a/libavcodec/svq1.c
+++ b/libavcodec/svq1.c
@@ -37,7 +37,7 @@
#include "svq1_vlc.h"
/* standard video sizes */
-const svq1_frame_size_t ff_svq1_frame_size_table[8] = {
+const struct svq1_frame_size ff_svq1_frame_size_table[8] = {
{ 160, 120 }, { 128, 96 }, { 176, 144 }, { 352, 288 },
{ 704, 576 }, { 240, 180 }, { 320, 240 }, { -1, -1 }
};
diff --git a/libavcodec/svq1.h b/libavcodec/svq1.h
index a8469d9..a4b5a16 100644
--- a/libavcodec/svq1.h
+++ b/libavcodec/svq1.h
@@ -42,10 +42,10 @@
#define SVQ1_BLOCK_INTER_4V 2
#define SVQ1_BLOCK_INTRA 3
-typedef struct {
+struct svq1_frame_size {
int width;
int height;
-} svq1_frame_size_t;
+};
uint16_t ff_svq1_packet_checksum (const uint8_t *data, const int length,
int value);
@@ -59,6 +59,6 @@ extern const uint8_t ff_svq1_inter_multistage_vlc[6][8][2];
extern const uint16_t ff_svq1_intra_mean_vlc[256][2];
extern const uint16_t ff_svq1_inter_mean_vlc[512][2];
-extern const svq1_frame_size_t ff_svq1_frame_size_table[8];
+extern const struct svq1_frame_size ff_svq1_frame_size_table[8];
#endif /* AVCODEC_SVQ1_H */
diff --git a/libavcodec/svq1dec.c b/libavcodec/svq1dec.c
index 8f399ab..d306149 100644
--- a/libavcodec/svq1dec.c
+++ b/libavcodec/svq1dec.c
@@ -56,7 +56,7 @@ static VLC svq1_inter_mean;
typedef struct svq1_pmv_s {
int x;
int y;
-} svq1_pmv_t;
+} svq1_pmv;
static const uint16_t checksum_table[256] = {
0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7,
@@ -301,7 +301,7 @@ static int svq1_decode_block_non_intra (GetBitContext *bitbuf, uint8_t *pixels,
return 0;
}
-static int svq1_decode_motion_vector (GetBitContext *bitbuf, svq1_pmv_t *mv, svq1_pmv_t **pmv) {
+static int svq1_decode_motion_vector (GetBitContext *bitbuf, svq1_pmv *mv, svq1_pmv **pmv) {
int diff;
int i;
@@ -342,11 +342,11 @@ static void svq1_skip_block (uint8_t *current, uint8_t *previous, int pitch, int
static int svq1_motion_inter_block (MpegEncContext *s, GetBitContext *bitbuf,
uint8_t *current, uint8_t *previous, int pitch,
- svq1_pmv_t *motion, int x, int y) {
+ svq1_pmv *motion, int x, int y) {
uint8_t *src;
uint8_t *dst;
- svq1_pmv_t mv;
- svq1_pmv_t *pmv[3];
+ svq1_pmv mv;
+ svq1_pmv *pmv[3];
int result;
/* predict and decode motion vector */
@@ -394,11 +394,11 @@ static int svq1_motion_inter_block (MpegEncContext *s, GetBitContext *bitbuf,
static int svq1_motion_inter_4v_block (MpegEncContext *s, GetBitContext *bitbuf,
uint8_t *current, uint8_t *previous, int pitch,
- svq1_pmv_t *motion,int x, int y) {
+ svq1_pmv *motion,int x, int y) {
uint8_t *src;
uint8_t *dst;
- svq1_pmv_t mv;
- svq1_pmv_t *pmv[4];
+ svq1_pmv mv;
+ svq1_pmv *pmv[4];
int i, result;
/* predict and decode motion vector (0) */
@@ -484,7 +484,7 @@ static int svq1_motion_inter_4v_block (MpegEncContext *s, GetBitContext *bitbuf,
static int svq1_decode_delta_block (MpegEncContext *s, GetBitContext *bitbuf,
uint8_t *current, uint8_t *previous, int pitch,
- svq1_pmv_t *motion, int x, int y) {
+ svq1_pmv *motion, int x, int y) {
uint32_t block_type;
int result = 0;
@@ -727,9 +727,9 @@ static int svq1_decode_frame(AVCodecContext *avctx,
current += 16*linesize;
}
} else {
- svq1_pmv_t pmv[width/8+3];
+ svq1_pmv pmv[width/8+3];
/* delta frame */
- memset (pmv, 0, ((width / 8) + 3) * sizeof(svq1_pmv_t));
+ memset (pmv, 0, ((width / 8) + 3) * sizeof(svq1_pmv));
for (y=0; y < height; y+=16) {
for (x=0; x < width; x+=16) {
diff --git a/libavcodec/svq1enc.c b/libavcodec/svq1enc.c
index 8d0bca5..49ad3d3 100644
--- a/libavcodec/svq1enc.c
+++ b/libavcodec/svq1enc.c
@@ -67,6 +67,8 @@ typedef struct SVQ1Context {
int16_t (*motion_val16[3])[2];
int64_t rd_total;
+
+ uint8_t *scratchbuf;
} SVQ1Context;
static void svq1_write_header(SVQ1Context *s, int frame_type)
@@ -378,7 +380,7 @@ static int svq1_encode_plane(SVQ1Context *s, int plane, unsigned char *src_plane
uint8_t *decoded= decoded_plane + offset;
uint8_t *ref= ref_plane + offset;
int score[4]={0,0,0,0}, best;
- uint8_t temp[16*stride];
+ uint8_t *temp = s->scratchbuf;
if(s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < 3000){ //FIXME check size
av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
@@ -524,6 +526,7 @@ static int svq1_encode_frame(AVCodecContext *avctx, unsigned char *buf,
if(!s->current_picture.data[0]){
avctx->get_buffer(avctx, &s->current_picture);
avctx->get_buffer(avctx, &s->last_picture);
+ s->scratchbuf = av_malloc(s->current_picture.linesize[0] * 16);
}
temp= s->current_picture;
@@ -566,6 +569,7 @@ static av_cold int svq1_encode_end(AVCodecContext *avctx)
av_freep(&s->m.me.score_map);
av_freep(&s->mb_type);
av_freep(&s->dummy);
+ av_freep(&s->scratchbuf);
for(i=0; i<3; i++){
av_freep(&s->motion_val8[i]);
diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c
index 1da6802..f438ba8 100644
--- a/libavcodec/svq3.c
+++ b/libavcodec/svq3.c
@@ -37,7 +37,7 @@
*
* You will know you have these parameters passed correctly when the decoder
* correctly decodes this file:
- * ftp://ftp.mplayerhq.hu/MPlayer/samples/V-codecs/SVQ3/Vertical400kbit.sorenson3.mov
+ * http://samples.mplayerhq.hu/V-codecs/SVQ3/Vertical400kbit.sorenson3.mov
*/
#ifdef CONFIG_ZLIB
@@ -65,93 +65,96 @@
/
o-->o-->o-->o
*/
-static const uint8_t svq3_scan[16]={
- 0+0*4, 1+0*4, 2+0*4, 2+1*4,
- 2+2*4, 3+0*4, 3+1*4, 3+2*4,
- 0+1*4, 0+2*4, 1+1*4, 1+2*4,
- 0+3*4, 1+3*4, 2+3*4, 3+3*4,
+static const uint8_t svq3_scan[16] = {
+ 0+0*4, 1+0*4, 2+0*4, 2+1*4,
+ 2+2*4, 3+0*4, 3+1*4, 3+2*4,
+ 0+1*4, 0+2*4, 1+1*4, 1+2*4,
+ 0+3*4, 1+3*4, 2+3*4, 3+3*4,
};
static const uint8_t svq3_pred_0[25][2] = {
- { 0, 0 },
- { 1, 0 }, { 0, 1 },
- { 0, 2 }, { 1, 1 }, { 2, 0 },
- { 3, 0 }, { 2, 1 }, { 1, 2 }, { 0, 3 },
- { 0, 4 }, { 1, 3 }, { 2, 2 }, { 3, 1 }, { 4, 0 },
- { 4, 1 }, { 3, 2 }, { 2, 3 }, { 1, 4 },
- { 2, 4 }, { 3, 3 }, { 4, 2 },
- { 4, 3 }, { 3, 4 },
- { 4, 4 }
+ { 0, 0 },
+ { 1, 0 }, { 0, 1 },
+ { 0, 2 }, { 1, 1 }, { 2, 0 },
+ { 3, 0 }, { 2, 1 }, { 1, 2 }, { 0, 3 },
+ { 0, 4 }, { 1, 3 }, { 2, 2 }, { 3, 1 }, { 4, 0 },
+ { 4, 1 }, { 3, 2 }, { 2, 3 }, { 1, 4 },
+ { 2, 4 }, { 3, 3 }, { 4, 2 },
+ { 4, 3 }, { 3, 4 },
+ { 4, 4 }
};
static const int8_t svq3_pred_1[6][6][5] = {
- { { 2,-1,-1,-1,-1 }, { 2, 1,-1,-1,-1 }, { 1, 2,-1,-1,-1 },
- { 2, 1,-1,-1,-1 }, { 1, 2,-1,-1,-1 }, { 1, 2,-1,-1,-1 } },
- { { 0, 2,-1,-1,-1 }, { 0, 2, 1, 4, 3 }, { 0, 1, 2, 4, 3 },
- { 0, 2, 1, 4, 3 }, { 2, 0, 1, 3, 4 }, { 0, 4, 2, 1, 3 } },
- { { 2, 0,-1,-1,-1 }, { 2, 1, 0, 4, 3 }, { 1, 2, 4, 0, 3 },
- { 2, 1, 0, 4, 3 }, { 2, 1, 4, 3, 0 }, { 1, 2, 4, 0, 3 } },
- { { 2, 0,-1,-1,-1 }, { 2, 0, 1, 4, 3 }, { 1, 2, 0, 4, 3 },
- { 2, 1, 0, 4, 3 }, { 2, 1, 3, 4, 0 }, { 2, 4, 1, 0, 3 } },
- { { 0, 2,-1,-1,-1 }, { 0, 2, 1, 3, 4 }, { 1, 2, 3, 0, 4 },
- { 2, 0, 1, 3, 4 }, { 2, 1, 3, 0, 4 }, { 2, 0, 4, 3, 1 } },
- { { 0, 2,-1,-1,-1 }, { 0, 2, 4, 1, 3 }, { 1, 4, 2, 0, 3 },
- { 4, 2, 0, 1, 3 }, { 2, 0, 1, 4, 3 }, { 4, 2, 1, 0, 3 } },
+ { { 2,-1,-1,-1,-1 }, { 2, 1,-1,-1,-1 }, { 1, 2,-1,-1,-1 },
+ { 2, 1,-1,-1,-1 }, { 1, 2,-1,-1,-1 }, { 1, 2,-1,-1,-1 } },
+ { { 0, 2,-1,-1,-1 }, { 0, 2, 1, 4, 3 }, { 0, 1, 2, 4, 3 },
+ { 0, 2, 1, 4, 3 }, { 2, 0, 1, 3, 4 }, { 0, 4, 2, 1, 3 } },
+ { { 2, 0,-1,-1,-1 }, { 2, 1, 0, 4, 3 }, { 1, 2, 4, 0, 3 },
+ { 2, 1, 0, 4, 3 }, { 2, 1, 4, 3, 0 }, { 1, 2, 4, 0, 3 } },
+ { { 2, 0,-1,-1,-1 }, { 2, 0, 1, 4, 3 }, { 1, 2, 0, 4, 3 },
+ { 2, 1, 0, 4, 3 }, { 2, 1, 3, 4, 0 }, { 2, 4, 1, 0, 3 } },
+ { { 0, 2,-1,-1,-1 }, { 0, 2, 1, 3, 4 }, { 1, 2, 3, 0, 4 },
+ { 2, 0, 1, 3, 4 }, { 2, 1, 3, 0, 4 }, { 2, 0, 4, 3, 1 } },
+ { { 0, 2,-1,-1,-1 }, { 0, 2, 4, 1, 3 }, { 1, 4, 2, 0, 3 },
+ { 4, 2, 0, 1, 3 }, { 2, 0, 1, 4, 3 }, { 4, 2, 1, 0, 3 } },
};
static const struct { uint8_t run; uint8_t level; } svq3_dct_tables[2][16] = {
- { { 0, 0 }, { 0, 1 }, { 1, 1 }, { 2, 1 }, { 0, 2 }, { 3, 1 }, { 4, 1 }, { 5, 1 },
- { 0, 3 }, { 1, 2 }, { 2, 2 }, { 6, 1 }, { 7, 1 }, { 8, 1 }, { 9, 1 }, { 0, 4 } },
- { { 0, 0 }, { 0, 1 }, { 1, 1 }, { 0, 2 }, { 2, 1 }, { 0, 3 }, { 0, 4 }, { 0, 5 },
- { 3, 1 }, { 4, 1 }, { 1, 2 }, { 1, 3 }, { 0, 6 }, { 0, 7 }, { 0, 8 }, { 0, 9 } }
+ { { 0, 0 }, { 0, 1 }, { 1, 1 }, { 2, 1 }, { 0, 2 }, { 3, 1 }, { 4, 1 }, { 5, 1 },
+ { 0, 3 }, { 1, 2 }, { 2, 2 }, { 6, 1 }, { 7, 1 }, { 8, 1 }, { 9, 1 }, { 0, 4 } },
+ { { 0, 0 }, { 0, 1 }, { 1, 1 }, { 0, 2 }, { 2, 1 }, { 0, 3 }, { 0, 4 }, { 0, 5 },
+ { 3, 1 }, { 4, 1 }, { 1, 2 }, { 1, 3 }, { 0, 6 }, { 0, 7 }, { 0, 8 }, { 0, 9 } }
};
static const uint32_t svq3_dequant_coeff[32] = {
- 3881, 4351, 4890, 5481, 6154, 6914, 7761, 8718,
- 9781, 10987, 12339, 13828, 15523, 17435, 19561, 21873,
- 24552, 27656, 30847, 34870, 38807, 43747, 49103, 54683,
- 61694, 68745, 77615, 89113,100253,109366,126635,141533
+ 3881, 4351, 4890, 5481, 6154, 6914, 7761, 8718,
+ 9781, 10987, 12339, 13828, 15523, 17435, 19561, 21873,
+ 24552, 27656, 30847, 34870, 38807, 43747, 49103, 54683,
+ 61694, 68745, 77615, 89113,100253,109366,126635,141533
};
-static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp){
- const int qmul= svq3_dequant_coeff[qp];
+static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp)
+{
+ const int qmul = svq3_dequant_coeff[qp];
#define stride 16
int i;
int temp[16];
- static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
- static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
-
- for(i=0; i<4; i++){
- const int offset= y_offset[i];
- const int z0= 13*(block[offset+stride*0] + block[offset+stride*4]);
- const int z1= 13*(block[offset+stride*0] - block[offset+stride*4]);
- const int z2= 7* block[offset+stride*1] - 17*block[offset+stride*5];
- const int z3= 17* block[offset+stride*1] + 7*block[offset+stride*5];
-
- temp[4*i+0]= z0+z3;
- temp[4*i+1]= z1+z2;
- temp[4*i+2]= z1-z2;
- temp[4*i+3]= z0-z3;
+ static const int x_offset[4] = {0, 1*stride, 4* stride, 5*stride};
+ static const int y_offset[4] = {0, 2*stride, 8* stride, 10*stride};
+
+ for (i = 0; i < 4; i++){
+ const int offset = y_offset[i];
+ const int z0 = 13*(block[offset+stride*0] + block[offset+stride*4]);
+ const int z1 = 13*(block[offset+stride*0] - block[offset+stride*4]);
+ const int z2 = 7* block[offset+stride*1] - 17*block[offset+stride*5];
+ const int z3 = 17* block[offset+stride*1] + 7*block[offset+stride*5];
+
+ temp[4*i+0] = z0+z3;
+ temp[4*i+1] = z1+z2;
+ temp[4*i+2] = z1-z2;
+ temp[4*i+3] = z0-z3;
}
- for(i=0; i<4; i++){
- const int offset= x_offset[i];
- const int z0= 13*(temp[4*0+i] + temp[4*2+i]);
- const int z1= 13*(temp[4*0+i] - temp[4*2+i]);
- const int z2= 7* temp[4*1+i] - 17*temp[4*3+i];
- const int z3= 17* temp[4*1+i] + 7*temp[4*3+i];
-
- block[stride*0 +offset]= ((z0 + z3)*qmul + 0x80000)>>20;
- block[stride*2 +offset]= ((z1 + z2)*qmul + 0x80000)>>20;
- block[stride*8 +offset]= ((z1 - z2)*qmul + 0x80000)>>20;
- block[stride*10+offset]= ((z0 - z3)*qmul + 0x80000)>>20;
+ for (i = 0; i < 4; i++){
+ const int offset = x_offset[i];
+ const int z0 = 13*(temp[4*0+i] + temp[4*2+i]);
+ const int z1 = 13*(temp[4*0+i] - temp[4*2+i]);
+ const int z2 = 7* temp[4*1+i] - 17*temp[4*3+i];
+ const int z3 = 17* temp[4*1+i] + 7*temp[4*3+i];
+
+ block[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20;
+ block[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20;
+ block[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20;
+ block[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20;
}
}
#undef stride
-static void svq3_add_idct_c (uint8_t *dst, DCTELEM *block, int stride, int qp, int dc){
- const int qmul= svq3_dequant_coeff[qp];
+static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp,
+ int dc)
+{
+ const int qmul = svq3_dequant_coeff[qp];
int i;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
@@ -160,859 +163,879 @@ static void svq3_add_idct_c (uint8_t *dst, DCTELEM *block, int stride, int qp, i
block[0] = 0;
}
- for (i=0; i < 4; i++) {
- const int z0= 13*(block[0 + 4*i] + block[2 + 4*i]);
- const int z1= 13*(block[0 + 4*i] - block[2 + 4*i]);
- const int z2= 7* block[1 + 4*i] - 17*block[3 + 4*i];
- const int z3= 17* block[1 + 4*i] + 7*block[3 + 4*i];
+ for (i = 0; i < 4; i++) {
+ const int z0 = 13*(block[0 + 4*i] + block[2 + 4*i]);
+ const int z1 = 13*(block[0 + 4*i] - block[2 + 4*i]);
+ const int z2 = 7* block[1 + 4*i] - 17*block[3 + 4*i];
+ const int z3 = 17* block[1 + 4*i] + 7*block[3 + 4*i];
- block[0 + 4*i]= z0 + z3;
- block[1 + 4*i]= z1 + z2;
- block[2 + 4*i]= z1 - z2;
- block[3 + 4*i]= z0 - z3;
+ block[0 + 4*i] = z0 + z3;
+ block[1 + 4*i] = z1 + z2;
+ block[2 + 4*i] = z1 - z2;
+ block[3 + 4*i] = z0 - z3;
}
- for (i=0; i < 4; i++) {
- const int z0= 13*(block[i + 4*0] + block[i + 4*2]);
- const int z1= 13*(block[i + 4*0] - block[i + 4*2]);
- const int z2= 7* block[i + 4*1] - 17*block[i + 4*3];
- const int z3= 17* block[i + 4*1] + 7*block[i + 4*3];
- const int rr= (dc + 0x80000);
-
- dst[i + stride*0]= cm[ dst[i + stride*0] + (((z0 + z3)*qmul + rr) >> 20) ];
- dst[i + stride*1]= cm[ dst[i + stride*1] + (((z1 + z2)*qmul + rr) >> 20) ];
- dst[i + stride*2]= cm[ dst[i + stride*2] + (((z1 - z2)*qmul + rr) >> 20) ];
- dst[i + stride*3]= cm[ dst[i + stride*3] + (((z0 - z3)*qmul + rr) >> 20) ];
+ for (i = 0; i < 4; i++) {
+ const int z0 = 13*(block[i + 4*0] + block[i + 4*2]);
+ const int z1 = 13*(block[i + 4*0] - block[i + 4*2]);
+ const int z2 = 7* block[i + 4*1] - 17*block[i + 4*3];
+ const int z3 = 17* block[i + 4*1] + 7*block[i + 4*3];
+ const int rr = (dc + 0x80000);
+
+ dst[i + stride*0] = cm[ dst[i + stride*0] + (((z0 + z3)*qmul + rr) >> 20) ];
+ dst[i + stride*1] = cm[ dst[i + stride*1] + (((z1 + z2)*qmul + rr) >> 20) ];
+ dst[i + stride*2] = cm[ dst[i + stride*2] + (((z1 - z2)*qmul + rr) >> 20) ];
+ dst[i + stride*3] = cm[ dst[i + stride*3] + (((z0 - z3)*qmul + rr) >> 20) ];
}
}
-static inline int svq3_decode_block (GetBitContext *gb, DCTELEM *block,
- int index, const int type) {
+static inline int svq3_decode_block(GetBitContext *gb, DCTELEM *block,
+ int index, const int type)
+{
+ static const uint8_t *const scan_patterns[4] =
+ { luma_dc_zigzag_scan, zigzag_scan, svq3_scan, chroma_dc_scan };
- static const uint8_t *const scan_patterns[4] =
- { luma_dc_zigzag_scan, zigzag_scan, svq3_scan, chroma_dc_scan };
+ int run, level, sign, vlc, limit;
+ const int intra = (3 * type) >> 2;
+ const uint8_t *const scan = scan_patterns[type];
- int run, level, sign, vlc, limit;
- const int intra = (3 * type) >> 2;
- const uint8_t *const scan = scan_patterns[type];
+ for (limit = (16 >> intra); index < 16; index = limit, limit += 8) {
+ for (; (vlc = svq3_get_ue_golomb(gb)) != 0; index++) {
- for (limit=(16 >> intra); index < 16; index=limit, limit+=8) {
- for (; (vlc = svq3_get_ue_golomb (gb)) != 0; index++) {
+ if (vlc == INVALID_VLC)
+ return -1;
- if (vlc == INVALID_VLC)
- return -1;
+ sign = (vlc & 0x1) - 1;
+ vlc = (vlc + 1) >> 1;
+
+ if (type == 3) {
+ if (vlc < 3) {
+ run = 0;
+ level = vlc;
+ } else if (vlc < 4) {
+ run = 1;
+ level = 1;
+ } else {
+ run = (vlc & 0x3);
+ level = ((vlc + 9) >> 2) - run;
+ }
+ } else {
+ if (vlc < 16) {
+ run = svq3_dct_tables[intra][vlc].run;
+ level = svq3_dct_tables[intra][vlc].level;
+ } else if (intra) {
+ run = (vlc & 0x7);
+ level = (vlc >> 3) + ((run == 0) ? 8 : ((run < 2) ? 2 : ((run < 5) ? 0 : -1)));
+ } else {
+ run = (vlc & 0xF);
+ level = (vlc >> 4) + ((run == 0) ? 4 : ((run < 3) ? 2 : ((run < 10) ? 1 : 0)));
+ }
+ }
- sign = (vlc & 0x1) - 1;
- vlc = (vlc + 1) >> 1;
+ if ((index += run) >= limit)
+ return -1;
- if (type == 3) {
- if (vlc < 3) {
- run = 0;
- level = vlc;
- } else if (vlc < 4) {
- run = 1;
- level = 1;
- } else {
- run = (vlc & 0x3);
- level = ((vlc + 9) >> 2) - run;
+ block[scan[index]] = (level ^ sign) - sign;
}
- } else {
- if (vlc < 16) {
- run = svq3_dct_tables[intra][vlc].run;
- level = svq3_dct_tables[intra][vlc].level;
- } else if (intra) {
- run = (vlc & 0x7);
- level = (vlc >> 3) + ((run == 0) ? 8 : ((run < 2) ? 2 : ((run < 5) ? 0 : -1)));
- } else {
- run = (vlc & 0xF);
- level = (vlc >> 4) + ((run == 0) ? 4 : ((run < 3) ? 2 : ((run < 10) ? 1 : 0)));
- }
- }
-
- if ((index += run) >= limit)
- return -1;
- block[scan[index]] = (level ^ sign) - sign;
- }
-
- if (type != 2) {
- break;
+ if (type != 2) {
+ break;
+ }
}
- }
- return 0;
+ return 0;
}
-static inline void svq3_mc_dir_part (MpegEncContext *s,
- int x, int y, int width, int height,
- int mx, int my, int dxy,
- int thirdpel, int dir, int avg) {
+static inline void svq3_mc_dir_part(MpegEncContext *s,
+ int x, int y, int width, int height,
+ int mx, int my, int dxy,
+ int thirdpel, int dir, int avg)
+{
+ const Picture *pic = (dir == 0) ? &s->last_picture : &s->next_picture;
+ uint8_t *src, *dest;
+ int i, emu = 0;
+ int blocksize = 2 - (width>>3); //16->0, 8->1, 4->2
- const Picture *pic = (dir == 0) ? &s->last_picture : &s->next_picture;
- uint8_t *src, *dest;
- int i, emu = 0;
- int blocksize= 2 - (width>>3); //16->0, 8->1, 4->2
+ mx += x;
+ my += y;
- mx += x;
- my += y;
+ if (mx < 0 || mx >= (s->h_edge_pos - width - 1) ||
+ my < 0 || my >= (s->v_edge_pos - height - 1)) {
- if (mx < 0 || mx >= (s->h_edge_pos - width - 1) ||
- my < 0 || my >= (s->v_edge_pos - height - 1)) {
+ if ((s->flags & CODEC_FLAG_EMU_EDGE)) {
+ emu = 1;
+ }
- if ((s->flags & CODEC_FLAG_EMU_EDGE)) {
- emu = 1;
+ mx = av_clip (mx, -16, (s->h_edge_pos - width + 15));
+ my = av_clip (my, -16, (s->v_edge_pos - height + 15));
}
- mx = av_clip (mx, -16, (s->h_edge_pos - width + 15));
- my = av_clip (my, -16, (s->v_edge_pos - height + 15));
- }
-
- /* form component predictions */
- dest = s->current_picture.data[0] + x + y*s->linesize;
- src = pic->data[0] + mx + my*s->linesize;
-
- if (emu) {
- ff_emulated_edge_mc (s->edge_emu_buffer, src, s->linesize, (width + 1), (height + 1),
- mx, my, s->h_edge_pos, s->v_edge_pos);
- src = s->edge_emu_buffer;
- }
- if(thirdpel)
- (avg ? s->dsp.avg_tpel_pixels_tab : s->dsp.put_tpel_pixels_tab)[dxy](dest, src, s->linesize, width, height);
- else
- (avg ? s->dsp.avg_pixels_tab : s->dsp.put_pixels_tab)[blocksize][dxy](dest, src, s->linesize, height);
-
- if (!(s->flags & CODEC_FLAG_GRAY)) {
- mx = (mx + (mx < (int) x)) >> 1;
- my = (my + (my < (int) y)) >> 1;
- width = (width >> 1);
- height = (height >> 1);
- blocksize++;
-
- for (i=1; i < 3; i++) {
- dest = s->current_picture.data[i] + (x >> 1) + (y >> 1)*s->uvlinesize;
- src = pic->data[i] + mx + my*s->uvlinesize;
-
- if (emu) {
- ff_emulated_edge_mc (s->edge_emu_buffer, src, s->uvlinesize, (width + 1), (height + 1),
- mx, my, (s->h_edge_pos >> 1), (s->v_edge_pos >> 1));
+ /* form component predictions */
+ dest = s->current_picture.data[0] + x + y*s->linesize;
+ src = pic->data[0] + mx + my*s->linesize;
+
+ if (emu) {
+ ff_emulated_edge_mc(s->edge_emu_buffer, src, s->linesize, (width + 1), (height + 1),
+ mx, my, s->h_edge_pos, s->v_edge_pos);
src = s->edge_emu_buffer;
- }
- if(thirdpel)
- (avg ? s->dsp.avg_tpel_pixels_tab : s->dsp.put_tpel_pixels_tab)[dxy](dest, src, s->uvlinesize, width, height);
- else
- (avg ? s->dsp.avg_pixels_tab : s->dsp.put_pixels_tab)[blocksize][dxy](dest, src, s->uvlinesize, height);
}
- }
+ if (thirdpel)
+ (avg ? s->dsp.avg_tpel_pixels_tab : s->dsp.put_tpel_pixels_tab)[dxy](dest, src, s->linesize, width, height);
+ else
+ (avg ? s->dsp.avg_pixels_tab : s->dsp.put_pixels_tab)[blocksize][dxy](dest, src, s->linesize, height);
+
+ if (!(s->flags & CODEC_FLAG_GRAY)) {
+ mx = (mx + (mx < (int) x)) >> 1;
+ my = (my + (my < (int) y)) >> 1;
+ width = (width >> 1);
+ height = (height >> 1);
+ blocksize++;
+
+ for (i = 1; i < 3; i++) {
+ dest = s->current_picture.data[i] + (x >> 1) + (y >> 1)*s->uvlinesize;
+ src = pic->data[i] + mx + my*s->uvlinesize;
+
+ if (emu) {
+ ff_emulated_edge_mc(s->edge_emu_buffer, src, s->uvlinesize, (width + 1), (height + 1),
+ mx, my, (s->h_edge_pos >> 1), (s->v_edge_pos >> 1));
+ src = s->edge_emu_buffer;
+ }
+ if (thirdpel)
+ (avg ? s->dsp.avg_tpel_pixels_tab : s->dsp.put_tpel_pixels_tab)[dxy](dest, src, s->uvlinesize, width, height);
+ else
+ (avg ? s->dsp.avg_pixels_tab : s->dsp.put_pixels_tab)[blocksize][dxy](dest, src, s->uvlinesize, height);
+ }
+ }
}
-static inline int svq3_mc_dir (H264Context *h, int size, int mode, int dir, int avg) {
-
- int i, j, k, mx, my, dx, dy, x, y;
- MpegEncContext *const s = (MpegEncContext *) h;
- const int part_width = ((size & 5) == 4) ? 4 : 16 >> (size & 1);
- const int part_height = 16 >> ((unsigned) (size + 1) / 3);
- const int extra_width = (mode == PREDICT_MODE) ? -16*6 : 0;
- const int h_edge_pos = 6*(s->h_edge_pos - part_width ) - extra_width;
- const int v_edge_pos = 6*(s->v_edge_pos - part_height) - extra_width;
-
- for (i=0; i < 16; i+=part_height) {
- for (j=0; j < 16; j+=part_width) {
- const int b_xy = (4*s->mb_x+(j>>2)) + (4*s->mb_y+(i>>2))*h->b_stride;
- int dxy;
- x = 16*s->mb_x + j;
- y = 16*s->mb_y + i;
- k = ((j>>2)&1) + ((i>>1)&2) + ((j>>1)&4) + (i&8);
-
- if (mode != PREDICT_MODE) {
- pred_motion (h, k, (part_width >> 2), dir, 1, &mx, &my);
- } else {
- mx = s->next_picture.motion_val[0][b_xy][0]<<1;
- my = s->next_picture.motion_val[0][b_xy][1]<<1;
-
- if (dir == 0) {
- mx = ((mx * h->frame_num_offset) / h->prev_frame_num_offset + 1)>>1;
- my = ((my * h->frame_num_offset) / h->prev_frame_num_offset + 1)>>1;
- } else {
- mx = ((mx * (h->frame_num_offset - h->prev_frame_num_offset)) / h->prev_frame_num_offset + 1)>>1;
- my = ((my * (h->frame_num_offset - h->prev_frame_num_offset)) / h->prev_frame_num_offset + 1)>>1;
+static inline int svq3_mc_dir(H264Context *h, int size, int mode, int dir,
+ int avg)
+{
+ int i, j, k, mx, my, dx, dy, x, y;
+ MpegEncContext *const s = (MpegEncContext *) h;
+ const int part_width = ((size & 5) == 4) ? 4 : 16 >> (size & 1);
+ const int part_height = 16 >> ((unsigned) (size + 1) / 3);
+ const int extra_width = (mode == PREDICT_MODE) ? -16*6 : 0;
+ const int h_edge_pos = 6*(s->h_edge_pos - part_width ) - extra_width;
+ const int v_edge_pos = 6*(s->v_edge_pos - part_height) - extra_width;
+
+ for (i = 0; i < 16; i += part_height) {
+ for (j = 0; j < 16; j += part_width) {
+ const int b_xy = (4*s->mb_x + (j >> 2)) + (4*s->mb_y + (i >> 2))*h->b_stride;
+ int dxy;
+ x = 16*s->mb_x + j;
+ y = 16*s->mb_y + i;
+ k = ((j >> 2) & 1) + ((i >> 1) & 2) + ((j >> 1) & 4) + (i & 8);
+
+ if (mode != PREDICT_MODE) {
+ pred_motion(h, k, (part_width >> 2), dir, 1, &mx, &my);
+ } else {
+ mx = s->next_picture.motion_val[0][b_xy][0]<<1;
+ my = s->next_picture.motion_val[0][b_xy][1]<<1;
+
+ if (dir == 0) {
+ mx = ((mx * h->frame_num_offset) / h->prev_frame_num_offset + 1) >> 1;
+ my = ((my * h->frame_num_offset) / h->prev_frame_num_offset + 1) >> 1;
+ } else {
+ mx = ((mx * (h->frame_num_offset - h->prev_frame_num_offset)) / h->prev_frame_num_offset + 1) >> 1;
+ my = ((my * (h->frame_num_offset - h->prev_frame_num_offset)) / h->prev_frame_num_offset + 1) >> 1;
+ }
+ }
+
+ /* clip motion vector prediction to frame border */
+ mx = av_clip(mx, extra_width - 6*x, h_edge_pos - 6*x);
+ my = av_clip(my, extra_width - 6*y, v_edge_pos - 6*y);
+
+ /* get (optional) motion vector differential */
+ if (mode == PREDICT_MODE) {
+ dx = dy = 0;
+ } else {
+ dy = svq3_get_se_golomb(&s->gb);
+ dx = svq3_get_se_golomb(&s->gb);
+
+ if (dx == INVALID_VLC || dy == INVALID_VLC) {
+ av_log(h->s.avctx, AV_LOG_ERROR, "invalid MV vlc\n");
+ return -1;
+ }
+ }
+
+ /* compute motion vector */
+ if (mode == THIRDPEL_MODE) {
+ int fx, fy;
+ mx = ((mx + 1)>>1) + dx;
+ my = ((my + 1)>>1) + dy;
+ fx = ((unsigned)(mx + 0x3000))/3 - 0x1000;
+ fy = ((unsigned)(my + 0x3000))/3 - 0x1000;
+ dxy = (mx - 3*fx) + 4*(my - 3*fy);
+
+ svq3_mc_dir_part(s, x, y, part_width, part_height, fx, fy, dxy, 1, dir, avg);
+ mx += mx;
+ my += my;
+ } else if (mode == HALFPEL_MODE || mode == PREDICT_MODE) {
+ mx = ((unsigned)(mx + 1 + 0x3000))/3 + dx - 0x1000;
+ my = ((unsigned)(my + 1 + 0x3000))/3 + dy - 0x1000;
+ dxy = (mx&1) + 2*(my&1);
+
+ svq3_mc_dir_part(s, x, y, part_width, part_height, mx>>1, my>>1, dxy, 0, dir, avg);
+ mx *= 3;
+ my *= 3;
+ } else {
+ mx = ((unsigned)(mx + 3 + 0x6000))/6 + dx - 0x1000;
+ my = ((unsigned)(my + 3 + 0x6000))/6 + dy - 0x1000;
+
+ svq3_mc_dir_part(s, x, y, part_width, part_height, mx, my, 0, 0, dir, avg);
+ mx *= 6;
+ my *= 6;
+ }
+
+ /* update mv_cache */
+ if (mode != PREDICT_MODE) {
+ int32_t mv = pack16to32(mx,my);
+
+ if (part_height == 8 && i < 8) {
+ *(int32_t *) h->mv_cache[dir][scan8[k] + 1*8] = mv;
+
+ if (part_width == 8 && j < 8) {
+ *(int32_t *) h->mv_cache[dir][scan8[k] + 1 + 1*8] = mv;
+ }
+ }
+ if (part_width == 8 && j < 8) {
+ *(int32_t *) h->mv_cache[dir][scan8[k] + 1] = mv;
+ }
+ if (part_width == 4 || part_height == 4) {
+ *(int32_t *) h->mv_cache[dir][scan8[k]] = mv;
+ }
+ }
+
+ /* write back motion vectors */
+ fill_rectangle(s->current_picture.motion_val[dir][b_xy], part_width>>2, part_height>>2, h->b_stride, pack16to32(mx,my), 4);
}
- }
-
- /* clip motion vector prediction to frame border */
- mx = av_clip (mx, extra_width - 6*x, h_edge_pos - 6*x);
- my = av_clip (my, extra_width - 6*y, v_edge_pos - 6*y);
-
- /* get (optional) motion vector differential */
- if (mode == PREDICT_MODE) {
- dx = dy = 0;
- } else {
- dy = svq3_get_se_golomb (&s->gb);
- dx = svq3_get_se_golomb (&s->gb);
-
- if (dx == INVALID_VLC || dy == INVALID_VLC) {
- av_log(h->s.avctx, AV_LOG_ERROR, "invalid MV vlc\n");
- return -1;
+ }
+
+ return 0;
+}
+
+static int svq3_decode_mb(H264Context *h, unsigned int mb_type)
+{
+ int i, j, k, m, dir, mode;
+ int cbp = 0;
+ uint32_t vlc;
+ int8_t *top, *left;
+ MpegEncContext *const s = (MpegEncContext *) h;
+ const int mb_xy = h->mb_xy;
+ const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
+
+ h->top_samples_available = (s->mb_y == 0) ? 0x33FF : 0xFFFF;
+ h->left_samples_available = (s->mb_x == 0) ? 0x5F5F : 0xFFFF;
+ h->topright_samples_available = 0xFFFF;
+
+ if (mb_type == 0) { /* SKIP */
+ if (s->pict_type == FF_P_TYPE || s->next_picture.mb_type[mb_xy] == -1) {
+ svq3_mc_dir_part(s, 16*s->mb_x, 16*s->mb_y, 16, 16, 0, 0, 0, 0, 0, 0);
+
+ if (s->pict_type == FF_B_TYPE) {
+ svq3_mc_dir_part(s, 16*s->mb_x, 16*s->mb_y, 16, 16, 0, 0, 0, 0, 1, 1);
+ }
+
+ mb_type = MB_TYPE_SKIP;
+ } else {
+ mb_type = FFMIN(s->next_picture.mb_type[mb_xy], 6);
+ if (svq3_mc_dir(h, mb_type, PREDICT_MODE, 0, 0) < 0)
+ return -1;
+ if (svq3_mc_dir(h, mb_type, PREDICT_MODE, 1, 1) < 0)
+ return -1;
+
+ mb_type = MB_TYPE_16x16;
}
- }
-
- /* compute motion vector */
- if (mode == THIRDPEL_MODE) {
- int fx, fy;
- mx = ((mx + 1)>>1) + dx;
- my = ((my + 1)>>1) + dy;
- fx= ((unsigned)(mx + 0x3000))/3 - 0x1000;
- fy= ((unsigned)(my + 0x3000))/3 - 0x1000;
- dxy= (mx - 3*fx) + 4*(my - 3*fy);
-
- svq3_mc_dir_part (s, x, y, part_width, part_height, fx, fy, dxy, 1, dir, avg);
- mx += mx;
- my += my;
- } else if (mode == HALFPEL_MODE || mode == PREDICT_MODE) {
- mx = ((unsigned)(mx + 1 + 0x3000))/3 + dx - 0x1000;
- my = ((unsigned)(my + 1 + 0x3000))/3 + dy - 0x1000;
- dxy= (mx&1) + 2*(my&1);
-
- svq3_mc_dir_part (s, x, y, part_width, part_height, mx>>1, my>>1, dxy, 0, dir, avg);
- mx *= 3;
- my *= 3;
- } else {
- mx = ((unsigned)(mx + 3 + 0x6000))/6 + dx - 0x1000;
- my = ((unsigned)(my + 3 + 0x6000))/6 + dy - 0x1000;
-
- svq3_mc_dir_part (s, x, y, part_width, part_height, mx, my, 0, 0, dir, avg);
- mx *= 6;
- my *= 6;
- }
-
- /* update mv_cache */
- if (mode != PREDICT_MODE) {
- int32_t mv = pack16to32(mx,my);
-
- if (part_height == 8 && i < 8) {
- *(int32_t *) h->mv_cache[dir][scan8[k] + 1*8] = mv;
-
- if (part_width == 8 && j < 8) {
- *(int32_t *) h->mv_cache[dir][scan8[k] + 1 + 1*8] = mv;
- }
+ } else if (mb_type < 8) { /* INTER */
+ if (h->thirdpel_flag && h->halfpel_flag == !get_bits1 (&s->gb)) {
+ mode = THIRDPEL_MODE;
+ } else if (h->halfpel_flag && h->thirdpel_flag == !get_bits1 (&s->gb)) {
+ mode = HALFPEL_MODE;
+ } else {
+ mode = FULLPEL_MODE;
}
- if (part_width == 8 && j < 8) {
- *(int32_t *) h->mv_cache[dir][scan8[k] + 1] = mv;
+
+ /* fill caches */
+ /* note ref_cache should contain here:
+ ????????
+ ???11111
+ N??11111
+ N??11111
+ N??11111
+ */
+
+ for (m = 0; m < 2; m++) {
+ if (s->mb_x > 0 && h->intra4x4_pred_mode[mb_xy - 1][0] != -1) {
+ for (i = 0; i < 4; i++) {
+ *(uint32_t *) h->mv_cache[m][scan8[0] - 1 + i*8] = *(uint32_t *) s->current_picture.motion_val[m][b_xy - 1 + i*h->b_stride];
+ }
+ } else {
+ for (i = 0; i < 4; i++) {
+ *(uint32_t *) h->mv_cache[m][scan8[0] - 1 + i*8] = 0;
+ }
+ }
+ if (s->mb_y > 0) {
+ memcpy(h->mv_cache[m][scan8[0] - 1*8], s->current_picture.motion_val[m][b_xy - h->b_stride], 4*2*sizeof(int16_t));
+ memset(&h->ref_cache[m][scan8[0] - 1*8], (h->intra4x4_pred_mode[mb_xy - s->mb_stride][4] == -1) ? PART_NOT_AVAILABLE : 1, 4);
+
+ if (s->mb_x < (s->mb_width - 1)) {
+ *(uint32_t *) h->mv_cache[m][scan8[0] + 4 - 1*8] = *(uint32_t *) s->current_picture.motion_val[m][b_xy - h->b_stride + 4];
+ h->ref_cache[m][scan8[0] + 4 - 1*8] =
+ (h->intra4x4_pred_mode[mb_xy - s->mb_stride + 1][0] == -1 ||
+ h->intra4x4_pred_mode[mb_xy - s->mb_stride ][4] == -1) ? PART_NOT_AVAILABLE : 1;
+ }else
+ h->ref_cache[m][scan8[0] + 4 - 1*8] = PART_NOT_AVAILABLE;
+ if (s->mb_x > 0) {
+ *(uint32_t *) h->mv_cache[m][scan8[0] - 1 - 1*8] = *(uint32_t *) s->current_picture.motion_val[m][b_xy - h->b_stride - 1];
+ h->ref_cache[m][scan8[0] - 1 - 1*8] = (h->intra4x4_pred_mode[mb_xy - s->mb_stride - 1][3] == -1) ? PART_NOT_AVAILABLE : 1;
+ }else
+ h->ref_cache[m][scan8[0] - 1 - 1*8] = PART_NOT_AVAILABLE;
+ }else
+ memset(&h->ref_cache[m][scan8[0] - 1*8 - 1], PART_NOT_AVAILABLE, 8);
+
+ if (s->pict_type != FF_B_TYPE)
+ break;
}
- if (part_width == 4 || part_height == 4) {
- *(int32_t *) h->mv_cache[dir][scan8[k]] = mv;
+
+ /* decode motion vector(s) and form prediction(s) */
+ if (s->pict_type == FF_P_TYPE) {
+ if (svq3_mc_dir(h, (mb_type - 1), mode, 0, 0) < 0)
+ return -1;
+ } else { /* FF_B_TYPE */
+ if (mb_type != 2) {
+ if (svq3_mc_dir(h, 0, mode, 0, 0) < 0)
+ return -1;
+ } else {
+ for (i = 0; i < 4; i++) {
+ memset(s->current_picture.motion_val[0][b_xy + i*h->b_stride], 0, 4*2*sizeof(int16_t));
+ }
+ }
+ if (mb_type != 1) {
+ if (svq3_mc_dir(h, 0, mode, 1, (mb_type == 3)) < 0)
+ return -1;
+ } else {
+ for (i = 0; i < 4; i++) {
+ memset(s->current_picture.motion_val[1][b_xy + i*h->b_stride], 0, 4*2*sizeof(int16_t));
+ }
+ }
}
- }
- /* write back motion vectors */
- fill_rectangle(s->current_picture.motion_val[dir][b_xy], part_width>>2, part_height>>2, h->b_stride, pack16to32(mx,my), 4);
- }
- }
+ mb_type = MB_TYPE_16x16;
+ } else if (mb_type == 8 || mb_type == 33) { /* INTRA4x4 */
+ memset(h->intra4x4_pred_mode_cache, -1, 8*5*sizeof(int8_t));
+
+ if (mb_type == 8) {
+ if (s->mb_x > 0) {
+ for (i = 0; i < 4; i++) {
+ h->intra4x4_pred_mode_cache[scan8[0] - 1 + i*8] = h->intra4x4_pred_mode[mb_xy - 1][i];
+ }
+ if (h->intra4x4_pred_mode_cache[scan8[0] - 1] == -1) {
+ h->left_samples_available = 0x5F5F;
+ }
+ }
+ if (s->mb_y > 0) {
+ h->intra4x4_pred_mode_cache[4+8*0] = h->intra4x4_pred_mode[mb_xy - s->mb_stride][4];
+ h->intra4x4_pred_mode_cache[5+8*0] = h->intra4x4_pred_mode[mb_xy - s->mb_stride][5];
+ h->intra4x4_pred_mode_cache[6+8*0] = h->intra4x4_pred_mode[mb_xy - s->mb_stride][6];
+ h->intra4x4_pred_mode_cache[7+8*0] = h->intra4x4_pred_mode[mb_xy - s->mb_stride][3];
+
+ if (h->intra4x4_pred_mode_cache[4+8*0] == -1) {
+ h->top_samples_available = 0x33FF;
+ }
+ }
+
+ /* decode prediction codes for luma blocks */
+ for (i = 0; i < 16; i+=2) {
+ vlc = svq3_get_ue_golomb(&s->gb);
+
+ if (vlc >= 25){
+ av_log(h->s.avctx, AV_LOG_ERROR, "luma prediction:%d\n", vlc);
+ return -1;
+ }
+
+ left = &h->intra4x4_pred_mode_cache[scan8[i] - 1];
+ top = &h->intra4x4_pred_mode_cache[scan8[i] - 8];
+
+ left[1] = svq3_pred_1[top[0] + 1][left[0] + 1][svq3_pred_0[vlc][0]];
+ left[2] = svq3_pred_1[top[1] + 1][left[1] + 1][svq3_pred_0[vlc][1]];
+
+ if (left[1] == -1 || left[2] == -1){
+ av_log(h->s.avctx, AV_LOG_ERROR, "weird prediction\n");
+ return -1;
+ }
+ }
+ } else { /* mb_type == 33, DC_128_PRED block type */
+ for (i = 0; i < 4; i++) {
+ memset(&h->intra4x4_pred_mode_cache[scan8[0] + 8*i], DC_PRED, 4);
+ }
+ }
- return 0;
-}
+ write_back_intra_pred_mode(h);
-static int svq3_decode_mb (H264Context *h, unsigned int mb_type) {
- int i, j, k, m, dir, mode;
- int cbp = 0;
- uint32_t vlc;
- int8_t *top, *left;
- MpegEncContext *const s = (MpegEncContext *) h;
- const int mb_xy = h->mb_xy;
- const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
+ if (mb_type == 8) {
+ check_intra4x4_pred_mode(h);
- h->top_samples_available = (s->mb_y == 0) ? 0x33FF : 0xFFFF;
- h->left_samples_available = (s->mb_x == 0) ? 0x5F5F : 0xFFFF;
- h->topright_samples_available = 0xFFFF;
+ h->top_samples_available = (s->mb_y == 0) ? 0x33FF : 0xFFFF;
+ h->left_samples_available = (s->mb_x == 0) ? 0x5F5F : 0xFFFF;
+ } else {
+ for (i = 0; i < 4; i++) {
+ memset(&h->intra4x4_pred_mode_cache[scan8[0] + 8*i], DC_128_PRED, 4);
+ }
- if (mb_type == 0) { /* SKIP */
- if (s->pict_type == FF_P_TYPE || s->next_picture.mb_type[mb_xy] == -1) {
- svq3_mc_dir_part (s, 16*s->mb_x, 16*s->mb_y, 16, 16, 0, 0, 0, 0, 0, 0);
+ h->top_samples_available = 0x33FF;
+ h->left_samples_available = 0x5F5F;
+ }
- if (s->pict_type == FF_B_TYPE) {
- svq3_mc_dir_part (s, 16*s->mb_x, 16*s->mb_y, 16, 16, 0, 0, 0, 0, 1, 1);
- }
+ mb_type = MB_TYPE_INTRA4x4;
+ } else { /* INTRA16x16 */
+ dir = i_mb_type_info[mb_type - 8].pred_mode;
+ dir = (dir >> 1) ^ 3*(dir & 1) ^ 1;
- mb_type = MB_TYPE_SKIP;
- } else {
- mb_type= FFMIN(s->next_picture.mb_type[mb_xy], 6);
- if(svq3_mc_dir (h, mb_type, PREDICT_MODE, 0, 0) < 0)
- return -1;
- if(svq3_mc_dir (h, mb_type, PREDICT_MODE, 1, 1) < 0)
- return -1;
+ if ((h->intra16x16_pred_mode = check_intra_pred_mode(h, dir)) == -1){
+ av_log(h->s.avctx, AV_LOG_ERROR, "check_intra_pred_mode = -1\n");
+ return -1;
+ }
- mb_type = MB_TYPE_16x16;
- }
- } else if (mb_type < 8) { /* INTER */
- if (h->thirdpel_flag && h->halfpel_flag == !get_bits1 (&s->gb)) {
- mode = THIRDPEL_MODE;
- } else if (h->halfpel_flag && h->thirdpel_flag == !get_bits1 (&s->gb)) {
- mode = HALFPEL_MODE;
- } else {
- mode = FULLPEL_MODE;
+ cbp = i_mb_type_info[mb_type - 8].cbp;
+ mb_type = MB_TYPE_INTRA16x16;
}
- /* fill caches */
- /* note ref_cache should contain here:
- ????????
- ???11111
- N??11111
- N??11111
- N??11111
- */
-
- for (m=0; m < 2; m++) {
- if (s->mb_x > 0 && h->intra4x4_pred_mode[mb_xy - 1][0] != -1) {
- for (i=0; i < 4; i++) {
- *(uint32_t *) h->mv_cache[m][scan8[0] - 1 + i*8] = *(uint32_t *) s->current_picture.motion_val[m][b_xy - 1 + i*h->b_stride];
+ if (!IS_INTER(mb_type) && s->pict_type != FF_I_TYPE) {
+ for (i = 0; i < 4; i++) {
+ memset(s->current_picture.motion_val[0][b_xy + i*h->b_stride], 0, 4*2*sizeof(int16_t));
}
- } else {
- for (i=0; i < 4; i++) {
- *(uint32_t *) h->mv_cache[m][scan8[0] - 1 + i*8] = 0;
+ if (s->pict_type == FF_B_TYPE) {
+ for (i = 0; i < 4; i++) {
+ memset(s->current_picture.motion_val[1][b_xy + i*h->b_stride], 0, 4*2*sizeof(int16_t));
+ }
}
- }
- if (s->mb_y > 0) {
- memcpy (h->mv_cache[m][scan8[0] - 1*8], s->current_picture.motion_val[m][b_xy - h->b_stride], 4*2*sizeof(int16_t));
- memset (&h->ref_cache[m][scan8[0] - 1*8], (h->intra4x4_pred_mode[mb_xy - s->mb_stride][4] == -1) ? PART_NOT_AVAILABLE : 1, 4);
-
- if (s->mb_x < (s->mb_width - 1)) {
- *(uint32_t *) h->mv_cache[m][scan8[0] + 4 - 1*8] = *(uint32_t *) s->current_picture.motion_val[m][b_xy - h->b_stride + 4];
- h->ref_cache[m][scan8[0] + 4 - 1*8] =
- (h->intra4x4_pred_mode[mb_xy - s->mb_stride + 1][0] == -1 ||
- h->intra4x4_pred_mode[mb_xy - s->mb_stride][4] == -1) ? PART_NOT_AVAILABLE : 1;
- }else
- h->ref_cache[m][scan8[0] + 4 - 1*8] = PART_NOT_AVAILABLE;
- if (s->mb_x > 0) {
- *(uint32_t *) h->mv_cache[m][scan8[0] - 1 - 1*8] = *(uint32_t *) s->current_picture.motion_val[m][b_xy - h->b_stride - 1];
- h->ref_cache[m][scan8[0] - 1 - 1*8] = (h->intra4x4_pred_mode[mb_xy - s->mb_stride - 1][3] == -1) ? PART_NOT_AVAILABLE : 1;
- }else
- h->ref_cache[m][scan8[0] - 1 - 1*8] = PART_NOT_AVAILABLE;
- }else
- memset (&h->ref_cache[m][scan8[0] - 1*8 - 1], PART_NOT_AVAILABLE, 8);
-
- if (s->pict_type != FF_B_TYPE)
- break;
+ }
+ if (!IS_INTRA4x4(mb_type)) {
+ memset(h->intra4x4_pred_mode[mb_xy], DC_PRED, 8);
+ }
+ if (!IS_SKIP(mb_type) || s->pict_type == FF_B_TYPE) {
+ memset(h->non_zero_count_cache + 8, 0, 4*9*sizeof(uint8_t));
+ s->dsp.clear_blocks(h->mb);
}
- /* decode motion vector(s) and form prediction(s) */
- if (s->pict_type == FF_P_TYPE) {
- if(svq3_mc_dir (h, (mb_type - 1), mode, 0, 0) < 0)
- return -1;
- } else { /* FF_B_TYPE */
- if (mb_type != 2) {
- if(svq3_mc_dir (h, 0, mode, 0, 0) < 0)
- return -1;
- } else {
- for (i=0; i < 4; i++) {
- memset (s->current_picture.motion_val[0][b_xy + i*h->b_stride], 0, 4*2*sizeof(int16_t));
- }
- }
- if (mb_type != 1) {
- if(svq3_mc_dir (h, 0, mode, 1, (mb_type == 3)) < 0)
- return -1;
- } else {
- for (i=0; i < 4; i++) {
- memset (s->current_picture.motion_val[1][b_xy + i*h->b_stride], 0, 4*2*sizeof(int16_t));
+ if (!IS_INTRA16x16(mb_type) && (!IS_SKIP(mb_type) || s->pict_type == FF_B_TYPE)) {
+ if ((vlc = svq3_get_ue_golomb(&s->gb)) >= 48){
+ av_log(h->s.avctx, AV_LOG_ERROR, "cbp_vlc=%d\n", vlc);
+ return -1;
}
- }
- }
- mb_type = MB_TYPE_16x16;
- } else if (mb_type == 8 || mb_type == 33) { /* INTRA4x4 */
- memset (h->intra4x4_pred_mode_cache, -1, 8*5*sizeof(int8_t));
+ cbp = IS_INTRA(mb_type) ? golomb_to_intra4x4_cbp[vlc] : golomb_to_inter_cbp[vlc];
+ }
+ if (IS_INTRA16x16(mb_type) || (s->pict_type != FF_I_TYPE && s->adaptive_quant && cbp)) {
+ s->qscale += svq3_get_se_golomb(&s->gb);
- if (mb_type == 8) {
- if (s->mb_x > 0) {
- for (i=0; i < 4; i++) {
- h->intra4x4_pred_mode_cache[scan8[0] - 1 + i*8] = h->intra4x4_pred_mode[mb_xy - 1][i];
- }
- if (h->intra4x4_pred_mode_cache[scan8[0] - 1] == -1) {
- h->left_samples_available = 0x5F5F;
+ if (s->qscale > 31){
+ av_log(h->s.avctx, AV_LOG_ERROR, "qscale:%d\n", s->qscale);
+ return -1;
}
- }
- if (s->mb_y > 0) {
- h->intra4x4_pred_mode_cache[4+8*0] = h->intra4x4_pred_mode[mb_xy - s->mb_stride][4];
- h->intra4x4_pred_mode_cache[5+8*0] = h->intra4x4_pred_mode[mb_xy - s->mb_stride][5];
- h->intra4x4_pred_mode_cache[6+8*0] = h->intra4x4_pred_mode[mb_xy - s->mb_stride][6];
- h->intra4x4_pred_mode_cache[7+8*0] = h->intra4x4_pred_mode[mb_xy - s->mb_stride][3];
-
- if (h->intra4x4_pred_mode_cache[4+8*0] == -1) {
- h->top_samples_available = 0x33FF;
+ }
+ if (IS_INTRA16x16(mb_type)) {
+ if (svq3_decode_block(&s->gb, h->mb, 0, 0)){
+ av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding intra luma dc\n");
+ return -1;
}
- }
-
- /* decode prediction codes for luma blocks */
- for (i=0; i < 16; i+=2) {
- vlc = svq3_get_ue_golomb (&s->gb);
+ }
- if (vlc >= 25){
- av_log(h->s.avctx, AV_LOG_ERROR, "luma prediction:%d\n", vlc);
- return -1;
+ if (cbp) {
+ const int index = IS_INTRA16x16(mb_type) ? 1 : 0;
+ const int type = ((s->qscale < 24 && IS_INTRA4x4(mb_type)) ? 2 : 1);
+
+ for (i = 0; i < 4; i++) {
+ if ((cbp & (1 << i))) {
+ for (j = 0; j < 4; j++) {
+ k = index ? ((j&1) + 2*(i&1) + 2*(j&2) + 4*(i&2)) : (4*i + j);
+ h->non_zero_count_cache[ scan8[k] ] = 1;
+
+ if (svq3_decode_block(&s->gb, &h->mb[16*k], index, type)){
+ av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding block\n");
+ return -1;
+ }
+ }
+ }
}
- left = &h->intra4x4_pred_mode_cache[scan8[i] - 1];
- top = &h->intra4x4_pred_mode_cache[scan8[i] - 8];
+ if ((cbp & 0x30)) {
+ for (i = 0; i < 2; ++i) {
+ if (svq3_decode_block(&s->gb, &h->mb[16*(16 + 4*i)], 0, 3)){
+ av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma dc block\n");
+ return -1;
+ }
+ }
+
+ if ((cbp & 0x20)) {
+ for (i = 0; i < 8; i++) {
+ h->non_zero_count_cache[ scan8[16+i] ] = 1;
+
+ if (svq3_decode_block(&s->gb, &h->mb[16*(16 + i)], 1, 1)){
+ av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma ac block\n");
+ return -1;
+ }
+ }
+ }
+ }
+ }
- left[1] = svq3_pred_1[top[0] + 1][left[0] + 1][svq3_pred_0[vlc][0]];
- left[2] = svq3_pred_1[top[1] + 1][left[1] + 1][svq3_pred_0[vlc][1]];
+ h->cbp= cbp;
+ s->current_picture.mb_type[mb_xy] = mb_type;
- if (left[1] == -1 || left[2] == -1){
- av_log(h->s.avctx, AV_LOG_ERROR, "weird prediction\n");
- return -1;
- }
- }
- } else { /* mb_type == 33, DC_128_PRED block type */
- for (i=0; i < 4; i++) {
- memset (&h->intra4x4_pred_mode_cache[scan8[0] + 8*i], DC_PRED, 4);
- }
+ if (IS_INTRA(mb_type)) {
+ h->chroma_pred_mode = check_intra_pred_mode(h, DC_PRED8x8);
}
- write_back_intra_pred_mode (h);
+ return 0;
+}
- if (mb_type == 8) {
- check_intra4x4_pred_mode (h);
+static int svq3_decode_slice_header(H264Context *h)
+{
+ MpegEncContext *const s = (MpegEncContext *) h;
+ const int mb_xy = h->mb_xy;
+ int i, header;
- h->top_samples_available = (s->mb_y == 0) ? 0x33FF : 0xFFFF;
- h->left_samples_available = (s->mb_x == 0) ? 0x5F5F : 0xFFFF;
- } else {
- for (i=0; i < 4; i++) {
- memset (&h->intra4x4_pred_mode_cache[scan8[0] + 8*i], DC_128_PRED, 4);
- }
+ header = get_bits(&s->gb, 8);
- h->top_samples_available = 0x33FF;
- h->left_samples_available = 0x5F5F;
- }
+ if (((header & 0x9F) != 1 && (header & 0x9F) != 2) || (header & 0x60) == 0) {
+ /* TODO: what? */
+ av_log(h->s.avctx, AV_LOG_ERROR, "unsupported slice header (%02X)\n", header);
+ return -1;
+ } else {
+ int length = (header >> 5) & 3;
- mb_type = MB_TYPE_INTRA4x4;
- } else { /* INTRA16x16 */
- dir = i_mb_type_info[mb_type - 8].pred_mode;
- dir = (dir >> 1) ^ 3*(dir & 1) ^ 1;
+ h->next_slice_index = get_bits_count(&s->gb) + 8*show_bits(&s->gb, 8*length) + 8*length;
- if ((h->intra16x16_pred_mode = check_intra_pred_mode (h, dir)) == -1){
- av_log(h->s.avctx, AV_LOG_ERROR, "check_intra_pred_mode = -1\n");
- return -1;
+ if (h->next_slice_index > s->gb.size_in_bits) {
+ av_log(h->s.avctx, AV_LOG_ERROR, "slice after bitstream end\n");
+ return -1;
}
- cbp = i_mb_type_info[mb_type - 8].cbp;
- mb_type = MB_TYPE_INTRA16x16;
- }
+ s->gb.size_in_bits = h->next_slice_index - 8*(length - 1);
+ skip_bits(&s->gb, 8);
- if (!IS_INTER(mb_type) && s->pict_type != FF_I_TYPE) {
- for (i=0; i < 4; i++) {
- memset (s->current_picture.motion_val[0][b_xy + i*h->b_stride], 0, 4*2*sizeof(int16_t));
- }
- if (s->pict_type == FF_B_TYPE) {
- for (i=0; i < 4; i++) {
- memset (s->current_picture.motion_val[1][b_xy + i*h->b_stride], 0, 4*2*sizeof(int16_t));
- }
+ if (h->svq3_watermark_key) {
+ uint32_t header = AV_RL32(&s->gb.buffer[(get_bits_count(&s->gb)>>3)+1]);
+ AV_WL32(&s->gb.buffer[(get_bits_count(&s->gb)>>3)+1], header ^ h->svq3_watermark_key);
+ }
+ if (length > 0) {
+ memcpy((uint8_t *) &s->gb.buffer[get_bits_count(&s->gb) >> 3],
+ &s->gb.buffer[s->gb.size_in_bits >> 3], (length - 1));
+ }
}
- }
- if (!IS_INTRA4x4(mb_type)) {
- memset (h->intra4x4_pred_mode[mb_xy], DC_PRED, 8);
- }
- if (!IS_SKIP(mb_type) || s->pict_type == FF_B_TYPE) {
- memset (h->non_zero_count_cache + 8, 0, 4*9*sizeof(uint8_t));
- s->dsp.clear_blocks(h->mb);
- }
-
- if (!IS_INTRA16x16(mb_type) && (!IS_SKIP(mb_type) || s->pict_type == FF_B_TYPE)) {
- if ((vlc = svq3_get_ue_golomb (&s->gb)) >= 48){
- av_log(h->s.avctx, AV_LOG_ERROR, "cbp_vlc=%d\n", vlc);
- return -1;
+
+ if ((i = svq3_get_ue_golomb(&s->gb)) == INVALID_VLC || i >= 3){
+ av_log(h->s.avctx, AV_LOG_ERROR, "illegal slice type %d \n", i);
+ return -1;
}
- cbp = IS_INTRA(mb_type) ? golomb_to_intra4x4_cbp[vlc] : golomb_to_inter_cbp[vlc];
- }
- if (IS_INTRA16x16(mb_type) || (s->pict_type != FF_I_TYPE && s->adaptive_quant && cbp)) {
- s->qscale += svq3_get_se_golomb (&s->gb);
+ h->slice_type = golomb_to_pict_type[i];
- if (s->qscale > 31){
- av_log(h->s.avctx, AV_LOG_ERROR, "qscale:%d\n", s->qscale);
- return -1;
- }
- }
- if (IS_INTRA16x16(mb_type)) {
- if (svq3_decode_block (&s->gb, h->mb, 0, 0)){
- av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding intra luma dc\n");
- return -1;
+ if ((header & 0x9F) == 2) {
+ i = (s->mb_num < 64) ? 6 : (1 + av_log2 (s->mb_num - 1));
+ s->mb_skip_run = get_bits(&s->gb, i) - (s->mb_x + (s->mb_y * s->mb_width));
+ } else {
+ skip_bits1(&s->gb);
+ s->mb_skip_run = 0;
}
- }
- if (cbp) {
- const int index = IS_INTRA16x16(mb_type) ? 1 : 0;
- const int type = ((s->qscale < 24 && IS_INTRA4x4(mb_type)) ? 2 : 1);
+ h->slice_num = get_bits(&s->gb, 8);
+ s->qscale = get_bits(&s->gb, 5);
+ s->adaptive_quant = get_bits1(&s->gb);
- for (i=0; i < 4; i++) {
- if ((cbp & (1 << i))) {
- for (j=0; j < 4; j++) {
- k = index ? ((j&1) + 2*(i&1) + 2*(j&2) + 4*(i&2)) : (4*i + j);
- h->non_zero_count_cache[ scan8[k] ] = 1;
+ /* unknown fields */
+ skip_bits1(&s->gb);
- if (svq3_decode_block (&s->gb, &h->mb[16*k], index, type)){
- av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding block\n");
- return -1;
- }
- }
- }
+ if (h->unknown_svq3_flag) {
+ skip_bits1(&s->gb);
}
- if ((cbp & 0x30)) {
- for (i=0; i < 2; ++i) {
- if (svq3_decode_block (&s->gb, &h->mb[16*(16 + 4*i)], 0, 3)){
- av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma dc block\n");
- return -1;
- }
- }
-
- if ((cbp & 0x20)) {
- for (i=0; i < 8; i++) {
- h->non_zero_count_cache[ scan8[16+i] ] = 1;
+ skip_bits1(&s->gb);
+ skip_bits(&s->gb, 2);
- if (svq3_decode_block (&s->gb, &h->mb[16*(16 + i)], 1, 1)){
- av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma ac block\n");
- return -1;
- }
- }
- }
+ while (get_bits1(&s->gb)) {
+ skip_bits(&s->gb, 8);
}
- }
- s->current_picture.mb_type[mb_xy] = mb_type;
+ /* reset intra predictors and invalidate motion vector references */
+ if (s->mb_x > 0) {
+ memset(h->intra4x4_pred_mode[mb_xy - 1], -1, 4*sizeof(int8_t));
+ memset(h->intra4x4_pred_mode[mb_xy - s->mb_x], -1, 8*sizeof(int8_t)*s->mb_x);
+ }
+ if (s->mb_y > 0) {
+ memset(h->intra4x4_pred_mode[mb_xy - s->mb_stride], -1, 8*sizeof(int8_t)*(s->mb_width - s->mb_x));
- if (IS_INTRA(mb_type)) {
- h->chroma_pred_mode = check_intra_pred_mode (h, DC_PRED8x8);
- }
+ if (s->mb_x > 0) {
+ h->intra4x4_pred_mode[mb_xy - s->mb_stride - 1][3] = -1;
+ }
+ }
- return 0;
+ return 0;
}
-static int svq3_decode_slice_header (H264Context *h) {
- MpegEncContext *const s = (MpegEncContext *) h;
- const int mb_xy = h->mb_xy;
- int i, header;
+static int svq3_decode_init(AVCodecContext *avctx)
+{
+ MpegEncContext *const s = avctx->priv_data;
+ H264Context *const h = avctx->priv_data;
+ int m;
+ unsigned char *extradata;
+ unsigned int size;
- header = get_bits (&s->gb, 8);
+ if (decode_init(avctx) < 0)
+ return -1;
- if (((header & 0x9F) != 1 && (header & 0x9F) != 2) || (header & 0x60) == 0) {
- /* TODO: what? */
- av_log(h->s.avctx, AV_LOG_ERROR, "unsupported slice header (%02X)\n", header);
- return -1;
- } else {
- int length = (header >> 5) & 3;
+ s->flags = avctx->flags;
+ s->flags2 = avctx->flags2;
+ s->unrestricted_mv = 1;
+ h->is_complex=1;
- h->next_slice_index = get_bits_count(&s->gb) + 8*show_bits (&s->gb, 8*length) + 8*length;
+ if (!s->context_initialized) {
+ s->width = avctx->width;
+ s->height = avctx->height;
+ h->halfpel_flag = 1;
+ h->thirdpel_flag = 1;
+ h->unknown_svq3_flag = 0;
+ h->chroma_qp[0] = h->chroma_qp[1] = 4;
- if (h->next_slice_index > s->gb.size_in_bits){
- av_log(h->s.avctx, AV_LOG_ERROR, "slice after bitstream end\n");
- return -1;
- }
+ if (MPV_common_init(s) < 0)
+ return -1;
- s->gb.size_in_bits = h->next_slice_index - 8*(length - 1);
- skip_bits(&s->gb, 8);
+ h->b_stride = 4*s->mb_width;
- if (h->svq3_watermark_key) {
- uint32_t header = AV_RL32(&s->gb.buffer[(get_bits_count(&s->gb)>>3)+1]);
- AV_WL32(&s->gb.buffer[(get_bits_count(&s->gb)>>3)+1], header ^ h->svq3_watermark_key);
- }
- if (length > 0) {
- memcpy ((uint8_t *) &s->gb.buffer[get_bits_count(&s->gb) >> 3],
- &s->gb.buffer[s->gb.size_in_bits >> 3], (length - 1));
- }
- }
+ alloc_tables(h);
- if ((i = svq3_get_ue_golomb (&s->gb)) == INVALID_VLC || i >= 3){
- av_log(h->s.avctx, AV_LOG_ERROR, "illegal slice type %d \n", i);
- return -1;
- }
+ /* prowl for the "SEQH" marker in the extradata */
+ extradata = (unsigned char *)avctx->extradata;
+ for (m = 0; m < avctx->extradata_size; m++) {
+ if (!memcmp(extradata, "SEQH", 4))
+ break;
+ extradata++;
+ }
- h->slice_type = golomb_to_pict_type[i];
+ /* if a match was found, parse the extra data */
+ if (extradata && !memcmp(extradata, "SEQH", 4)) {
- if ((header & 0x9F) == 2) {
- i = (s->mb_num < 64) ? 6 : (1 + av_log2 (s->mb_num - 1));
- s->mb_skip_run = get_bits (&s->gb, i) - (s->mb_x + (s->mb_y * s->mb_width));
- } else {
- skip_bits1 (&s->gb);
- s->mb_skip_run = 0;
- }
+ GetBitContext gb;
- h->slice_num = get_bits (&s->gb, 8);
- s->qscale = get_bits (&s->gb, 5);
- s->adaptive_quant = get_bits1 (&s->gb);
+ size = AV_RB32(&extradata[4]);
+ init_get_bits(&gb, extradata + 8, size*8);
- /* unknown fields */
- skip_bits1 (&s->gb);
+ /* 'frame size code' and optional 'width, height' */
+ if (get_bits(&gb, 3) == 7) {
+ skip_bits(&gb, 12);
+ skip_bits(&gb, 12);
+ }
- if (h->unknown_svq3_flag) {
- skip_bits1 (&s->gb);
- }
+ h->halfpel_flag = get_bits1(&gb);
+ h->thirdpel_flag = get_bits1(&gb);
- skip_bits1 (&s->gb);
- skip_bits (&s->gb, 2);
+ /* unknown fields */
+ skip_bits1(&gb);
+ skip_bits1(&gb);
+ skip_bits1(&gb);
+ skip_bits1(&gb);
- while (get_bits1 (&s->gb)) {
- skip_bits (&s->gb, 8);
- }
+ s->low_delay = get_bits1(&gb);
- /* reset intra predictors and invalidate motion vector references */
- if (s->mb_x > 0) {
- memset (h->intra4x4_pred_mode[mb_xy - 1], -1, 4*sizeof(int8_t));
- memset (h->intra4x4_pred_mode[mb_xy - s->mb_x], -1, 8*sizeof(int8_t)*s->mb_x);
- }
- if (s->mb_y > 0) {
- memset (h->intra4x4_pred_mode[mb_xy - s->mb_stride], -1, 8*sizeof(int8_t)*(s->mb_width - s->mb_x));
+ /* unknown field */
+ skip_bits1(&gb);
- if (s->mb_x > 0) {
- h->intra4x4_pred_mode[mb_xy - s->mb_stride - 1][3] = -1;
+ while (get_bits1(&gb)) {
+ skip_bits(&gb, 8);
+ }
+
+ h->unknown_svq3_flag = get_bits1(&gb);
+ avctx->has_b_frames = !s->low_delay;
+ if (h->unknown_svq3_flag) {
+#ifdef CONFIG_ZLIB
+ unsigned watermark_width = svq3_get_ue_golomb(&gb);
+ unsigned watermark_height = svq3_get_ue_golomb(&gb);
+ int u1 = svq3_get_ue_golomb(&gb);
+ int u2 = get_bits(&gb, 8);
+ int u3 = get_bits(&gb, 2);
+ int u4 = svq3_get_ue_golomb(&gb);
+ unsigned buf_len = watermark_width*watermark_height*4;
+ int offset = (get_bits_count(&gb)+7)>>3;
+ uint8_t *buf;
+
+ if ((uint64_t)watermark_width*4 > UINT_MAX/watermark_height)
+ return -1;
+
+ buf = av_malloc(buf_len);
+ av_log(avctx, AV_LOG_DEBUG, "watermark size: %dx%d\n", watermark_width, watermark_height);
+ av_log(avctx, AV_LOG_DEBUG, "u1: %x u2: %x u3: %x compressed data size: %d offset: %d\n", u1, u2, u3, u4, offset);
+ if (uncompress(buf, (uLong*)&buf_len, extradata + 8 + offset, size - offset) != Z_OK) {
+ av_log(avctx, AV_LOG_ERROR, "could not uncompress watermark logo\n");
+ av_free(buf);
+ return -1;
+ }
+ h->svq3_watermark_key = ff_svq1_packet_checksum(buf, buf_len, 0);
+ h->svq3_watermark_key = h->svq3_watermark_key << 16 | h->svq3_watermark_key;
+ av_log(avctx, AV_LOG_DEBUG, "watermark key %#x\n", h->svq3_watermark_key);
+ av_free(buf);
+#else
+ av_log(avctx, AV_LOG_ERROR, "this svq3 file contains watermark which need zlib support compiled in\n");
+ return -1;
+#endif
+ }
+ }
}
- }
- return 0;
+ return 0;
}
-static int svq3_decode_frame (AVCodecContext *avctx,
- void *data, int *data_size,
- const uint8_t *buf, int buf_size) {
- MpegEncContext *const s = avctx->priv_data;
- H264Context *const h = avctx->priv_data;
- int m, mb_type;
- unsigned char *extradata;
- unsigned int size;
-
- s->flags = avctx->flags;
- s->flags2 = avctx->flags2;
- s->unrestricted_mv = 1;
-
- if (!s->context_initialized) {
- s->width = avctx->width;
- s->height = avctx->height;
- h->halfpel_flag = 1;
- h->thirdpel_flag = 1;
- h->unknown_svq3_flag = 0;
- h->chroma_qp[0] = h->chroma_qp[1] = 4;
-
- if (MPV_common_init (s) < 0)
- return -1;
-
- h->b_stride = 4*s->mb_width;
-
- alloc_tables (h);
-
- /* prowl for the "SEQH" marker in the extradata */
- extradata = (unsigned char *)avctx->extradata;
- for (m = 0; m < avctx->extradata_size; m++) {
- if (!memcmp (extradata, "SEQH", 4))
- break;
- extradata++;
+static int svq3_decode_frame(AVCodecContext *avctx,
+ void *data, int *data_size,
+ const uint8_t *buf, int buf_size)
+{
+ MpegEncContext *const s = avctx->priv_data;
+ H264Context *const h = avctx->priv_data;
+ int m, mb_type;
+
+ /* special case for last picture */
+ if (buf_size == 0) {
+ if (s->next_picture_ptr && !s->low_delay) {
+ *(AVFrame *) data = *(AVFrame *) &s->next_picture;
+ s->next_picture_ptr = NULL;
+ *data_size = sizeof(AVFrame);
+ }
+ return 0;
}
- /* if a match was found, parse the extra data */
- if (extradata && !memcmp (extradata, "SEQH", 4)) {
+ init_get_bits (&s->gb, buf, 8*buf_size);
- GetBitContext gb;
+ s->mb_x = s->mb_y = h->mb_xy = 0;
- size = AV_RB32(&extradata[4]);
- init_get_bits (&gb, extradata + 8, size*8);
-
- /* 'frame size code' and optional 'width, height' */
- if (get_bits (&gb, 3) == 7) {
- skip_bits (&gb, 12);
- skip_bits (&gb, 12);
- }
-
- h->halfpel_flag = get_bits1 (&gb);
- h->thirdpel_flag = get_bits1 (&gb);
-
- /* unknown fields */
- skip_bits1 (&gb);
- skip_bits1 (&gb);
- skip_bits1 (&gb);
- skip_bits1 (&gb);
-
- s->low_delay = get_bits1 (&gb);
-
- /* unknown field */
- skip_bits1 (&gb);
-
- while (get_bits1 (&gb)) {
- skip_bits (&gb, 8);
- }
+ if (svq3_decode_slice_header(h))
+ return -1;
- h->unknown_svq3_flag = get_bits1 (&gb);
- avctx->has_b_frames = !s->low_delay;
- if (h->unknown_svq3_flag) {
-#ifdef CONFIG_ZLIB
- unsigned watermark_width = svq3_get_ue_golomb(&gb);
- unsigned watermark_height = svq3_get_ue_golomb(&gb);
- int u1 = svq3_get_ue_golomb(&gb);
- int u2 = get_bits(&gb, 8);
- int u3 = get_bits(&gb, 2);
- int u4 = svq3_get_ue_golomb(&gb);
- unsigned buf_len = watermark_width*watermark_height*4;
- int offset = (get_bits_count(&gb)+7)>>3;
- uint8_t *buf;
-
- if ((uint64_t)watermark_width*4 > UINT_MAX/watermark_height)
- return -1;
+ s->pict_type = h->slice_type;
+ s->picture_number = h->slice_num;
- buf = av_malloc(buf_len);
- av_log(avctx, AV_LOG_DEBUG, "watermark size: %dx%d\n", watermark_width, watermark_height);
- av_log(avctx, AV_LOG_DEBUG, "u1: %x u2: %x u3: %x compressed data size: %d offset: %d\n", u1, u2, u3, u4, offset);
- if (uncompress(buf, (uLong*)&buf_len, extradata + 8 + offset, size - offset) != Z_OK) {
- av_log(avctx, AV_LOG_ERROR, "could not uncompress watermark logo\n");
- av_free(buf);
- return -1;
- }
- h->svq3_watermark_key = ff_svq1_packet_checksum(buf, buf_len, 0);
- h->svq3_watermark_key = h->svq3_watermark_key << 16 | h->svq3_watermark_key;
- av_log(avctx, AV_LOG_DEBUG, "watermark key %#x\n", h->svq3_watermark_key);
- av_free(buf);
-#else
- av_log(avctx, AV_LOG_ERROR, "this svq3 file contains watermark which need zlib support compiled in\n");
- return -1;
-#endif
- }
+ if (avctx->debug&FF_DEBUG_PICT_INFO){
+ av_log(h->s.avctx, AV_LOG_DEBUG, "%c hpel:%d, tpel:%d aqp:%d qp:%d, slice_num:%02X\n",
+ av_get_pict_type_char(s->pict_type), h->halfpel_flag, h->thirdpel_flag,
+ s->adaptive_quant, s->qscale, h->slice_num);
}
- }
-
- /* special case for last picture */
- if (buf_size == 0) {
- if (s->next_picture_ptr && !s->low_delay) {
- *(AVFrame *) data = *(AVFrame *) &s->next_picture;
- s->next_picture_ptr= NULL;
- *data_size = sizeof(AVFrame);
+
+ /* for hurry_up == 5 */
+ s->current_picture.pict_type = s->pict_type;
+ s->current_picture.key_frame = (s->pict_type == FF_I_TYPE);
+
+ /* Skip B-frames if we do not have reference frames. */
+ if (s->last_picture_ptr == NULL && s->pict_type == FF_B_TYPE)
+ return 0;
+ /* Skip B-frames if we are in a hurry. */
+ if (avctx->hurry_up && s->pict_type == FF_B_TYPE)
+ return 0;
+ /* Skip everything if we are in a hurry >= 5. */
+ if (avctx->hurry_up >= 5)
+ return 0;
+ if ( (avctx->skip_frame >= AVDISCARD_NONREF && s->pict_type == FF_B_TYPE)
+ ||(avctx->skip_frame >= AVDISCARD_NONKEY && s->pict_type != FF_I_TYPE)
+ || avctx->skip_frame >= AVDISCARD_ALL)
+ return 0;
+
+ if (s->next_p_frame_damaged) {
+ if (s->pict_type == FF_B_TYPE)
+ return 0;
+ else
+ s->next_p_frame_damaged = 0;
}
- return 0;
- }
-
- init_get_bits (&s->gb, buf, 8*buf_size);
-
- s->mb_x = s->mb_y = h->mb_xy = 0;
-
- if (svq3_decode_slice_header (h))
- return -1;
-
- s->pict_type = h->slice_type;
- s->picture_number = h->slice_num;
-
- if(avctx->debug&FF_DEBUG_PICT_INFO){
- av_log(h->s.avctx, AV_LOG_DEBUG, "%c hpel:%d, tpel:%d aqp:%d qp:%d, slice_num:%02X\n",
- av_get_pict_type_char(s->pict_type), h->halfpel_flag, h->thirdpel_flag,
- s->adaptive_quant, s->qscale, h->slice_num
- );
- }
-
- /* for hurry_up==5 */
- s->current_picture.pict_type = s->pict_type;
- s->current_picture.key_frame = (s->pict_type == FF_I_TYPE);
-
- /* Skip B-frames if we do not have reference frames. */
- if (s->last_picture_ptr == NULL && s->pict_type == FF_B_TYPE) return 0;
- /* Skip B-frames if we are in a hurry. */
- if (avctx->hurry_up && s->pict_type == FF_B_TYPE) return 0;
- /* Skip everything if we are in a hurry >= 5. */
- if (avctx->hurry_up >= 5) return 0;
- if( (avctx->skip_frame >= AVDISCARD_NONREF && s->pict_type==FF_B_TYPE)
- ||(avctx->skip_frame >= AVDISCARD_NONKEY && s->pict_type!=FF_I_TYPE)
- || avctx->skip_frame >= AVDISCARD_ALL)
- return 0;
-
- if (s->next_p_frame_damaged) {
- if (s->pict_type == FF_B_TYPE)
- return 0;
- else
- s->next_p_frame_damaged = 0;
- }
- if (frame_start (h) < 0)
- return -1;
+ if (frame_start(h) < 0)
+ return -1;
+
+ if (s->pict_type == FF_B_TYPE) {
+ h->frame_num_offset = (h->slice_num - h->prev_frame_num);
- if (s->pict_type == FF_B_TYPE) {
- h->frame_num_offset = (h->slice_num - h->prev_frame_num);
+ if (h->frame_num_offset < 0) {
+ h->frame_num_offset += 256;
+ }
+ if (h->frame_num_offset == 0 || h->frame_num_offset >= h->prev_frame_num_offset) {
+ av_log(h->s.avctx, AV_LOG_ERROR, "error in B-frame picture id\n");
+ return -1;
+ }
+ } else {
+ h->prev_frame_num = h->frame_num;
+ h->frame_num = h->slice_num;
+ h->prev_frame_num_offset = (h->frame_num - h->prev_frame_num);
- if (h->frame_num_offset < 0) {
- h->frame_num_offset += 256;
- }
- if (h->frame_num_offset == 0 || h->frame_num_offset >= h->prev_frame_num_offset) {
- av_log(h->s.avctx, AV_LOG_ERROR, "error in B-frame picture id\n");
- return -1;
+ if (h->prev_frame_num_offset < 0) {
+ h->prev_frame_num_offset += 256;
+ }
}
- } else {
- h->prev_frame_num = h->frame_num;
- h->frame_num = h->slice_num;
- h->prev_frame_num_offset = (h->frame_num - h->prev_frame_num);
- if (h->prev_frame_num_offset < 0) {
- h->prev_frame_num_offset += 256;
+ for (m = 0; m < 2; m++){
+ int i;
+ for (i = 0; i < 4; i++){
+ int j;
+ for (j = -1; j < 4; j++)
+ h->ref_cache[m][scan8[0] + 8*i + j]= 1;
+ if (i < 3)
+ h->ref_cache[m][scan8[0] + 8*i + j]= PART_NOT_AVAILABLE;
+ }
}
- }
- for(m=0; m<2; m++){
- int i;
- for(i=0; i<4; i++){
- int j;
- for(j=-1; j<4; j++)
- h->ref_cache[m][scan8[0] + 8*i + j]= 1;
- if(i<3)
- h->ref_cache[m][scan8[0] + 8*i + j]= PART_NOT_AVAILABLE;
- }
- }
+ for (s->mb_y = 0; s->mb_y < s->mb_height; s->mb_y++) {
+ for (s->mb_x = 0; s->mb_x < s->mb_width; s->mb_x++) {
+ h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
- for (s->mb_y=0; s->mb_y < s->mb_height; s->mb_y++) {
- for (s->mb_x=0; s->mb_x < s->mb_width; s->mb_x++) {
- h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
+ if ( (get_bits_count(&s->gb) + 7) >= s->gb.size_in_bits &&
+ ((get_bits_count(&s->gb) & 7) == 0 || show_bits(&s->gb, (-get_bits_count(&s->gb) & 7)) == 0)) {
- if ( (get_bits_count(&s->gb) + 7) >= s->gb.size_in_bits &&
- ((get_bits_count(&s->gb) & 7) == 0 || show_bits (&s->gb, (-get_bits_count(&s->gb) & 7)) == 0)) {
+ skip_bits(&s->gb, h->next_slice_index - get_bits_count(&s->gb));
+ s->gb.size_in_bits = 8*buf_size;
- skip_bits(&s->gb, h->next_slice_index - get_bits_count(&s->gb));
- s->gb.size_in_bits = 8*buf_size;
+ if (svq3_decode_slice_header(h))
+ return -1;
- if (svq3_decode_slice_header (h))
- return -1;
+ /* TODO: support s->mb_skip_run */
+ }
- /* TODO: support s->mb_skip_run */
- }
+ mb_type = svq3_get_ue_golomb(&s->gb);
- mb_type = svq3_get_ue_golomb (&s->gb);
+ if (s->pict_type == FF_I_TYPE) {
+ mb_type += 8;
+ } else if (s->pict_type == FF_B_TYPE && mb_type >= 4) {
+ mb_type += 4;
+ }
+ if (mb_type > 33 || svq3_decode_mb(h, mb_type)) {
+ av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
+ return -1;
+ }
- if (s->pict_type == FF_I_TYPE) {
- mb_type += 8;
- } else if (s->pict_type == FF_B_TYPE && mb_type >= 4) {
- mb_type += 4;
- }
- if (mb_type > 33 || svq3_decode_mb (h, mb_type)) {
- av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
- return -1;
- }
+ if (mb_type != 0) {
+ hl_decode_mb (h);
+ }
- if (mb_type != 0) {
- hl_decode_mb (h);
- }
+ if (s->pict_type != FF_B_TYPE && !s->low_delay) {
+ s->current_picture.mb_type[s->mb_x + s->mb_y*s->mb_stride] =
+ (s->pict_type == FF_P_TYPE && mb_type < 8) ? (mb_type - 1) : -1;
+ }
+ }
- if (s->pict_type != FF_B_TYPE && !s->low_delay) {
- s->current_picture.mb_type[s->mb_x + s->mb_y*s->mb_stride] =
- (s->pict_type == FF_P_TYPE && mb_type < 8) ? (mb_type - 1) : -1;
- }
+ ff_draw_horiz_band(s, 16*s->mb_y, 16);
}
- ff_draw_horiz_band(s, 16*s->mb_y, 16);
- }
+ MPV_frame_end(s);
- MPV_frame_end(s);
-
- if (s->pict_type == FF_B_TYPE || s->low_delay) {
- *(AVFrame *) data = *(AVFrame *) &s->current_picture;
- } else {
- *(AVFrame *) data = *(AVFrame *) &s->last_picture;
- }
+ if (s->pict_type == FF_B_TYPE || s->low_delay) {
+ *(AVFrame *) data = *(AVFrame *) &s->current_picture;
+ } else {
+ *(AVFrame *) data = *(AVFrame *) &s->last_picture;
+ }
- avctx->frame_number = s->picture_number - 1;
+ avctx->frame_number = s->picture_number - 1;
- /* Do not output the last pic after seeking. */
- if (s->last_picture_ptr || s->low_delay) {
- *data_size = sizeof(AVFrame);
- }
+ /* Do not output the last pic after seeking. */
+ if (s->last_picture_ptr || s->low_delay) {
+ *data_size = sizeof(AVFrame);
+ }
- return buf_size;
+ return buf_size;
}
@@ -1021,7 +1044,7 @@ AVCodec svq3_decoder = {
CODEC_TYPE_VIDEO,
CODEC_ID_SVQ3,
sizeof(H264Context),
- decode_init,
+ svq3_decode_init,
NULL,
decode_end,
svq3_decode_frame,
diff --git a/libavcodec/tiff.c b/libavcodec/tiff.c
index 7e42aab..582076b 100644
--- a/libavcodec/tiff.c
+++ b/libavcodec/tiff.c
@@ -30,6 +30,7 @@
#endif
#include "lzw.h"
#include "tiff.h"
+#include "faxcompr.h"
typedef struct TiffContext {
@@ -41,8 +42,10 @@ typedef struct TiffContext {
int le;
int compr;
int invert;
+ int fax_opts;
+ int predictor;
- int strips, rps;
+ int strips, rps, sstype;
int sot;
const uint8_t* stripdata;
const uint8_t* stripsizes;
@@ -74,7 +77,7 @@ static int tget(const uint8_t **p, int type, int le){
static int tiff_unpack_strip(TiffContext *s, uint8_t* dst, int stride, const uint8_t *src, int size, int lines){
int c, line, pixels, code;
const uint8_t *ssrc = src;
- int width = s->width * (s->bpp / 8);
+ int width = s->width * s->bpp >> 3;
#ifdef CONFIG_ZLIB
uint8_t *zbuf; unsigned long outlen;
@@ -102,6 +105,29 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t* dst, int stride, const uin
return -1;
}
}
+ if(s->compr == TIFF_CCITT_RLE || s->compr == TIFF_G3 || s->compr == TIFF_G4){
+ int i, ret = 0;
+ uint8_t *src2 = av_malloc(size + FF_INPUT_BUFFER_PADDING_SIZE);
+
+ if(!src2 || (unsigned)size + FF_INPUT_BUFFER_PADDING_SIZE < (unsigned)size){
+ av_log(s->avctx, AV_LOG_ERROR, "Error allocating temporary buffer\n");
+ return -1;
+ }
+ for(i = 0; i < size; i++)
+ src2[i] = ff_reverse[src[i]];
+ memset(src2+size, 0, FF_INPUT_BUFFER_PADDING_SIZE);
+ if(s->compr == TIFF_G3 && !(s->fax_opts & 1))
+ s->compr = TIFF_CCITT_RLE;
+ switch(s->compr){
+ case TIFF_CCITT_RLE:
+ case TIFF_G3:
+ case TIFF_G4:
+ ret = ff_ccitt_unpack(s->avctx, src2, size, dst, lines, stride, s->compr);
+ break;
+ }
+ av_free(src2);
+ return ret;
+ }
for(line = 0; line < lines; line++){
if(src - ssrc > size){
av_log(s->avctx, AV_LOG_ERROR, "Source data overread\n");
@@ -109,8 +135,8 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t* dst, int stride, const uin
}
switch(s->compr){
case TIFF_RAW:
- memcpy(dst, src, s->width * (s->bpp / 8));
- src += s->width * (s->bpp / 8);
+ memcpy(dst, src, width);
+ src += width;
break;
case TIFF_PACKBITS:
for(pixels = 0; pixels < width;){
@@ -150,12 +176,10 @@ static int tiff_unpack_strip(TiffContext *s, uint8_t* dst, int stride, const uin
}
-static int tiff_decode_tag(TiffContext *s, const uint8_t *start, const uint8_t *buf, const uint8_t *end_buf, AVFrame *pic)
+static int tiff_decode_tag(TiffContext *s, const uint8_t *start, const uint8_t *buf, const uint8_t *end_buf)
{
int tag, type, count, off, value = 0;
- const uint8_t *src;
- uint8_t *dst;
- int i, j, ssize, soff, stride;
+ int i, j;
uint32_t *pal;
const uint8_t *rp, *gp, *bp;
@@ -176,6 +200,11 @@ static int tiff_decode_tag(TiffContext *s, const uint8_t *start, const uint8_t *
value = off;
buf = NULL;
break;
+ case TIFF_STRING:
+ if(count <= 4){
+ buf -= 4;
+ break;
+ }
default:
value = -1;
buf = start + off;
@@ -215,6 +244,9 @@ static int tiff_decode_tag(TiffContext *s, const uint8_t *start, const uint8_t *
}
}
switch(s->bpp){
+ case 1:
+ s->avctx->pix_fmt = PIX_FMT_MONOBLACK;
+ break;
case 8:
s->avctx->pix_fmt = PIX_FMT_PAL8;
break;
@@ -253,10 +285,16 @@ static int tiff_decode_tag(TiffContext *s, const uint8_t *start, const uint8_t *
break;
case TIFF_COMPR:
s->compr = value;
+ s->predictor = 0;
switch(s->compr){
case TIFF_RAW:
case TIFF_PACKBITS:
case TIFF_LZW:
+ case TIFF_CCITT_RLE:
+ break;
+ case TIFF_G3:
+ case TIFF_G4:
+ s->fax_opts = 0;
break;
case TIFF_DEFLATE:
case TIFF_ADOBE_DEFLATE:
@@ -266,15 +304,6 @@ static int tiff_decode_tag(TiffContext *s, const uint8_t *start, const uint8_t *
av_log(s->avctx, AV_LOG_ERROR, "Deflate: ZLib not compiled in\n");
return -1;
#endif
- case TIFF_G3:
- av_log(s->avctx, AV_LOG_ERROR, "CCITT G3 compression is not supported\n");
- return -1;
- case TIFF_G4:
- av_log(s->avctx, AV_LOG_ERROR, "CCITT G4 compression is not supported\n");
- return -1;
- case TIFF_CCITT_RLE:
- av_log(s->avctx, AV_LOG_ERROR, "CCITT RLE compression is not supported\n");
- return -1;
case TIFF_JPEG:
case TIFF_NEWJPEG:
av_log(s->avctx, AV_LOG_ERROR, "JPEG compression is not supported\n");
@@ -285,6 +314,8 @@ static int tiff_decode_tag(TiffContext *s, const uint8_t *start, const uint8_t *
}
break;
case TIFF_ROWSPERSTRIP:
+ if(type == TIFF_LONG && value == -1)
+ value = s->avctx->height;
if(value < 1){
av_log(s->avctx, AV_LOG_ERROR, "Incorrect value of rows per strip\n");
return -1;
@@ -314,49 +345,14 @@ static int tiff_decode_tag(TiffContext *s, const uint8_t *start, const uint8_t *
s->stripsizes = start + off;
}
s->strips = count;
+ s->sstype = type;
if(s->stripsizes > end_buf){
av_log(s->avctx, AV_LOG_ERROR, "Tag referencing position outside the image\n");
return -1;
}
- if(!pic->data[0]){
- av_log(s->avctx, AV_LOG_ERROR, "Picture initialization missing\n");
- return -1;
- }
- /* now we have the data and may start decoding */
- stride = pic->linesize[0];
- dst = pic->data[0];
- for(i = 0; i < s->height; i += s->rps){
- if(s->stripsizes)
- ssize = tget(&s->stripsizes, type, s->le);
- else
- ssize = s->stripsize;
-
- if(s->stripdata){
- soff = tget(&s->stripdata, s->sot, s->le);
- }else
- soff = s->stripoff;
- src = start + soff;
- if(tiff_unpack_strip(s, dst, stride, src, ssize, FFMIN(s->rps, s->height - i)) < 0)
- break;
- dst += s->rps * stride;
- }
break;
case TIFF_PREDICTOR:
- if(!pic->data[0]){
- av_log(s->avctx, AV_LOG_ERROR, "Picture initialization missing\n");
- return -1;
- }
- if(value == 2){
- dst = pic->data[0];
- stride = pic->linesize[0];
- soff = s->bpp >> 3;
- ssize = s->width * soff;
- for(i = 0; i < s->height; i++) {
- for(j = soff; j < ssize; j++)
- dst[j] += dst[j - soff];
- dst += stride;
- }
- }
+ s->predictor = value;
break;
case TIFF_INVERT:
switch(value){
@@ -398,6 +394,10 @@ static int tiff_decode_tag(TiffContext *s, const uint8_t *start, const uint8_t *
return -1;
}
break;
+ case TIFF_T4OPTIONS:
+ case TIFF_T6OPTIONS:
+ s->fax_opts = value;
+ break;
}
return 0;
}
@@ -411,7 +411,9 @@ static int decode_frame(AVCodecContext *avctx,
AVFrame * const p= (AVFrame*)&s->picture;
const uint8_t *orig_buf = buf, *end_buf = buf + buf_size;
int id, le, off;
- int i, entries;
+ int i, j, entries;
+ int stride, soff, ssize;
+ uint8_t *dst;
//parse image header
id = AV_RL16(buf); buf += 2;
@@ -439,10 +441,49 @@ static int decode_frame(AVCodecContext *avctx,
buf = orig_buf + off;
entries = tget_short(&buf, le);
for(i = 0; i < entries; i++){
- if(tiff_decode_tag(s, orig_buf, buf, end_buf, p) < 0)
+ if(tiff_decode_tag(s, orig_buf, buf, end_buf) < 0)
return -1;
buf += 12;
}
+ if(!s->stripdata && !s->stripoff){
+ av_log(avctx, AV_LOG_ERROR, "Image data is missing\n");
+ return -1;
+ }
+ /* now we have the data and may start decoding */
+ if(!p->data[0]){
+ av_log(s->avctx, AV_LOG_ERROR, "Picture initialization missing\n");
+ return -1;
+ }
+ if(s->strips == 1 && !s->stripsize){
+ av_log(avctx, AV_LOG_WARNING, "Image data size missing\n");
+ s->stripsize = buf_size - s->stripoff;
+ }
+ stride = p->linesize[0];
+ dst = p->data[0];
+ for(i = 0; i < s->height; i += s->rps){
+ if(s->stripsizes)
+ ssize = tget(&s->stripsizes, s->sstype, s->le);
+ else
+ ssize = s->stripsize;
+
+ if(s->stripdata){
+ soff = tget(&s->stripdata, s->sot, s->le);
+ }else
+ soff = s->stripoff;
+ if(tiff_unpack_strip(s, dst, stride, orig_buf + soff, ssize, FFMIN(s->rps, s->height - i)) < 0)
+ break;
+ dst += s->rps * stride;
+ }
+ if(s->predictor == 2){
+ dst = p->data[0];
+ soff = s->bpp >> 3;
+ ssize = s->width * soff;
+ for(i = 0; i < s->height; i++) {
+ for(j = soff; j < ssize; j++)
+ dst[j] += dst[j - soff];
+ dst += stride;
+ }
+ }
if(s->invert){
uint8_t *src;
@@ -471,6 +512,7 @@ static av_cold int tiff_init(AVCodecContext *avctx){
avctx->coded_frame= (AVFrame*)&s->picture;
s->picture.data[0] = NULL;
ff_lzw_decode_open(&s->lzw);
+ ff_ccitt_unpack_init();
return 0;
}
diff --git a/libavcodec/tiff.h b/libavcodec/tiff.h
index 1c51592..dfabe29 100644
--- a/libavcodec/tiff.h
+++ b/libavcodec/tiff.h
@@ -46,6 +46,8 @@ enum TiffTags{
TIFF_PLANAR = 0x11C,
TIFF_XPOS = 0x11E,
TIFF_YPOS = 0x11F,
+ TIFF_T4OPTIONS = 0x124,
+ TIFF_T6OPTIONS,
TIFF_RES_UNIT = 0x128,
TIFF_SOFTWARE_NAME = 0x131,
TIFF_PREDICTOR = 0x13D,
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index 2f162bc..dcd7602 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -87,13 +87,14 @@ AVCodec *av_codec_next(AVCodec *c){
else return first_avcodec;
}
-void register_avcodec(AVCodec *format)
+void register_avcodec(AVCodec *codec)
{
AVCodec **p;
+ avcodec_init();
p = &first_avcodec;
while (*p != NULL) p = &(*p)->next;
- *p = format;
- format->next = NULL;
+ *p = codec;
+ codec->next = NULL;
}
void avcodec_set_dimensions(AVCodecContext *s, int width, int height){
@@ -563,6 +564,7 @@ static const AVOption options[]={
{"simplearm", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARM, INT_MIN, INT_MAX, V|E|D, "idct"},
{"simplearmv5te", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARMV5TE, INT_MIN, INT_MAX, V|E|D, "idct"},
{"simplearmv6", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARMV6, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simpleneon", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLENEON, INT_MIN, INT_MAX, V|E|D, "idct"},
{"h264", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_H264, INT_MIN, INT_MAX, V|E|D, "idct"},
{"vp3", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_VP3, INT_MIN, INT_MAX, V|E|D, "idct"},
{"ipp", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_IPP, INT_MIN, INT_MAX, V|E|D, "idct"},
@@ -743,6 +745,8 @@ static const AVOption options[]={
{"bits_per_raw_sample", NULL, OFFSET(bits_per_raw_sample), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
{"channel_layout", NULL, OFFSET(channel_layout), FF_OPT_TYPE_INT64, DEFAULT, 0, INT64_MAX, A|E|D, "channel_layout"},
{"request_channel_layout", NULL, OFFSET(request_channel_layout), FF_OPT_TYPE_INT64, DEFAULT, 0, INT64_MAX, A|D, "request_channel_layout"},
+{"rc_max_vbv_use", NULL, OFFSET(rc_max_available_vbv_use), FF_OPT_TYPE_FLOAT, 1.0/3, 0.0, FLT_MAX, V|E},
+{"rc_min_vbv_use", NULL, OFFSET(rc_min_vbv_overflow_use), FF_OPT_TYPE_FLOAT, 3, 0.0, FLT_MAX, V|E},
{NULL},
};
@@ -1506,7 +1510,7 @@ int av_parse_video_frame_rate(AVRational *frame_rate, const char *arg)
return 0;
}
-void av_log_missing_feature(void *avc, const char *feature, int want_sample)
+void ff_log_missing_feature(void *avc, const char *feature, int want_sample)
{
av_log(avc, AV_LOG_WARNING, "%s not implemented. Update your FFmpeg "
"version to the newest one from SVN. If the problem still "
@@ -1514,7 +1518,7 @@ void av_log_missing_feature(void *avc, const char *feature, int want_sample)
"been implemented.", feature);
if(want_sample)
av_log(avc, AV_LOG_WARNING, " If you want to help, upload a sample "
- "of this file to ftp://upload.mplayerhq.hu/MPlayer/incoming/ "
- "and contact the FFmpeg-devel mailing list.");
+ "of this file to ftp://upload.ffmpeg.org/MPlayer/incoming/ "
+ "and contact the ffmpeg-devel mailing list.");
av_log(avc, AV_LOG_WARNING, "\n");
}
diff --git a/libavcodec/vc1.c b/libavcodec/vc1.c
index 375c20a..62a0a97 100644
--- a/libavcodec/vc1.c
+++ b/libavcodec/vc1.c
@@ -1059,13 +1059,13 @@ static int decode_sequence_header_adv(VC1Context *v, GetBitContext *gb)
static int decode_entry_point(AVCodecContext *avctx, GetBitContext *gb)
{
VC1Context *v = avctx->priv_data;
- int i, blink, clentry, refdist;
+ int i, blink, clentry;
av_log(avctx, AV_LOG_DEBUG, "Entry point: %08X\n", show_bits_long(gb, 32));
blink = get_bits1(gb); // broken link
clentry = get_bits1(gb); // closed entry
v->panscanflag = get_bits1(gb);
- refdist = get_bits1(gb); // refdist flag
+ v->refdist_flag = get_bits1(gb);
v->s.loop_filter = get_bits1(gb);
v->fastuvmc = get_bits1(gb);
v->extended_mv = get_bits1(gb);
@@ -1099,7 +1099,7 @@ static int decode_entry_point(AVCodecContext *avctx, GetBitContext *gb)
"BrokenLink=%i, ClosedEntry=%i, PanscanFlag=%i\n"
"RefDist=%i, Postproc=%i, FastUVMC=%i, ExtMV=%i\n"
"DQuant=%i, VSTransform=%i, Overlap=%i, Qmode=%i\n",
- blink, clentry, v->panscanflag, refdist, v->s.loop_filter,
+ blink, clentry, v->panscanflag, v->refdist_flag, v->s.loop_filter,
v->fastuvmc, v->extended_mv, v->dquant, v->vstransform, v->overlap, v->quantizer_mode);
return 0;
@@ -1394,6 +1394,8 @@ static int vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
else v->halfpq = 0;
if (v->quantizer_mode == QUANT_FRAME_EXPLICIT)
v->pquantizer = get_bits1(gb);
+ if(v->postprocflag)
+ v->postproc = get_bits1(gb);
if(v->s.pict_type == FF_I_TYPE || v->s.pict_type == FF_P_TYPE) v->use_ic = 0;
@@ -1416,8 +1418,6 @@ static int vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
}
break;
case FF_P_TYPE:
- if(v->postprocflag)
- v->postproc = get_bits1(gb);
if (v->extended_mv) v->mvrange = get_unary(gb, 0, 3);
else v->mvrange = 0;
v->k_x = v->mvrange + 9 + (v->mvrange >> 1); //k_x can be 9 10 12 13
@@ -1507,8 +1507,6 @@ static int vc1_parse_frame_header_adv(VC1Context *v, GetBitContext* gb)
}
break;
case FF_B_TYPE:
- if(v->postprocflag)
- v->postproc = get_bits1(gb);
if (v->extended_mv) v->mvrange = get_unary(gb, 0, 3);
else v->mvrange = 0;
v->k_x = v->mvrange + 9 + (v->mvrange >> 1); //k_x can be 9 10 12 13
diff --git a/libavcodec/vc1.h b/libavcodec/vc1.h
index 4cd7bb1..212842a 100644
--- a/libavcodec/vc1.h
+++ b/libavcodec/vc1.h
@@ -180,6 +180,7 @@ typedef struct VC1Context{
int interlace; ///< Progressive/interlaced (RPTFTM syntax element)
int tfcntrflag; ///< TFCNTR present
int panscanflag; ///< NUMPANSCANWIN, TOPLEFT{X,Y}, BOTRIGHT{X,Y} present
+ int refdist_flag; ///< REFDIST syntax element present in II, IP, PI or PP field picture headers
int extended_dmv; ///< Additional extended dmv range at P/B frame-level
int color_prim; ///< 8bits, chroma coordinates of the color primaries
int transfer_char; ///< 8bits, Opto-electronic transfer characteristics
diff --git a/libavcodec/vdpau.h b/libavcodec/vdpau.h
new file mode 100644
index 0000000..2d03356
--- /dev/null
+++ b/libavcodec/vdpau.h
@@ -0,0 +1,84 @@
+/*
+ * The Video Decode and Presentation API for UNIX (VDPAU) is used for
+ * hardware-accelerated decoding of MPEG-1/2, H.264 and VC-1.
+ *
+ * Copyright (C) 2008 NVIDIA
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VDPAU_H
+#define AVCODEC_VDPAU_H
+
+/**
+ * \defgroup Decoder VDPAU Decoder and Renderer
+ *
+ * VDPAU hardware acceleration has two modules
+ * - VDPAU decoding
+ * - VDPAU presentation
+ *
+ * The VDPAU decoding module parses all headers using FFmpeg
+ * parsing mechanisms and uses VDPAU for the actual decoding.
+ *
+ * As per the current implementation, the actual decoding
+ * and rendering (API calls) are done as part of the VDPAU
+ * presentation (vo_vdpau.c) module.
+ *
+ * @{
+ * \defgroup VDPAU_Decoding VDPAU Decoding
+ * \ingroup Decoder
+ * @{
+ */
+
+#include <vdpau/vdpau.h>
+#include <vdpau/vdpau_x11.h>
+
+/** \brief The videoSurface is used for rendering. */
+#define FF_VDPAU_STATE_USED_FOR_RENDER 1
+
+/**
+ * \brief The videoSurface is needed for reference/prediction.
+ * The codec manipulates this.
+ */
+#define FF_VDPAU_STATE_USED_FOR_REFERENCE 2
+
+/**
+ * \brief This structure is used as a callback between the FFmpeg
+ * decoder (vd_) and presentation (vo_) module.
+ * This is used for defining a video frame containing surface,
+ * picture parameter, bitstream information etc which are passed
+ * between the FFmpeg decoder and its clients.
+ */
+struct vdpau_render_state {
+ VdpVideoSurface surface; ///< Used as rendered surface, never changed.
+
+ int state; ///< Holds FF_VDPAU_STATE_* values.
+
+ /** picture parameter information for all supported codecs */
+ union VdpPictureInfo {
+ VdpPictureInfoH264 h264;
+ } info;
+
+ /** Describe size/location of the compressed video data. */
+ int bitstream_buffers_allocated;
+ int bitstream_buffers_used;
+ VdpBitstreamBuffer *bitstream_buffers;
+};
+
+/* @}*/
+
+#endif /* AVCODEC_VDPAU_H */
diff --git a/libavcodec/vdpau_internal.h b/libavcodec/vdpau_internal.h
new file mode 100644
index 0000000..6af2495
--- /dev/null
+++ b/libavcodec/vdpau_internal.h
@@ -0,0 +1,34 @@
+/*
+ * Video Decode and Presentation API for UNIX (VDPAU) is used for
+ * HW decode acceleration for MPEG-1/2, H.264 and VC-1.
+ *
+ * Copyright (C) 2008 NVIDIA.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VDPAU_INTERNAL_H
+#define AVCODEC_VDPAU_INTERNAL_H
+
+#include <stdint.h>
+#include "h264.h"
+
+void ff_vdpau_h264_add_data_chunk(H264Context *h, const uint8_t *buf,
+ int buf_size);
+void ff_vdpau_h264_picture_complete(H264Context *h);
+
+#endif /* AVCODEC_VDPAU_INTERNAL_H */
diff --git a/libavcodec/vdpauvideo.c b/libavcodec/vdpauvideo.c
new file mode 100644
index 0000000..8b42823
--- /dev/null
+++ b/libavcodec/vdpauvideo.c
@@ -0,0 +1,183 @@
+/*
+ * Video Decode and Presentation API for UNIX (VDPAU) is used for
+ * HW decode acceleration for MPEG-1/2, H.264 and VC-1.
+ *
+ * Copyright (c) 2008 NVIDIA.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <limits.h>
+#include "avcodec.h"
+#include "h264.h"
+
+#undef NDEBUG
+#include <assert.h>
+
+#include "vdpau.h"
+#include "vdpau_internal.h"
+
+/**
+ * \addtogroup VDPAU_Decoding
+ *
+ * @{
+ */
+
+static void vdpau_h264_set_reference_frames(H264Context *h)
+{
+ MpegEncContext * s = &h->s;
+ struct vdpau_render_state * render, * render_ref;
+ VdpReferenceFrameH264 * rf, * rf2;
+ Picture * pic;
+ int i, list, pic_frame_idx;
+
+ render = (struct vdpau_render_state*)s->current_picture_ptr->data[0];
+ assert(render);
+
+ rf = &render->info.h264.referenceFrames[0];
+#define H264_RF_COUNT FF_ARRAY_ELEMS(render->info.h264.referenceFrames)
+
+ for (list = 0; list < 2; ++list) {
+ Picture **lp = list ? h->long_ref : h->short_ref;
+ int ls = list ? h->long_ref_count : h->short_ref_count;
+
+ for (i = 0; i < ls; ++i) {
+ pic = lp[i];
+ if (!pic || !pic->reference)
+ continue;
+ pic_frame_idx = pic->long_ref ? pic->pic_id : pic->frame_num;
+
+ render_ref = (struct vdpau_render_state*)pic->data[0];
+ assert(render_ref);
+
+ rf2 = &render->info.h264.referenceFrames[0];
+ while (rf2 != rf) {
+ if (
+ (rf2->surface == render_ref->surface)
+ && (rf2->is_long_term == pic->long_ref)
+ && (rf2->frame_idx == pic_frame_idx)
+ )
+ break;
+ ++rf2;
+ }
+ if (rf2 != rf) {
+ rf2->top_is_reference |= (pic->reference & PICT_TOP_FIELD) ? VDP_TRUE : VDP_FALSE;
+ rf2->bottom_is_reference |= (pic->reference & PICT_BOTTOM_FIELD) ? VDP_TRUE : VDP_FALSE;
+ continue;
+ }
+
+ if (rf >= &render->info.h264.referenceFrames[H264_RF_COUNT])
+ continue;
+
+ rf->surface = render_ref->surface;
+ rf->is_long_term = pic->long_ref;
+ rf->top_is_reference = (pic->reference & PICT_TOP_FIELD) ? VDP_TRUE : VDP_FALSE;
+ rf->bottom_is_reference = (pic->reference & PICT_BOTTOM_FIELD) ? VDP_TRUE : VDP_FALSE;
+ rf->field_order_cnt[0] = pic->field_poc[0];
+ rf->field_order_cnt[1] = pic->field_poc[1];
+ rf->frame_idx = pic_frame_idx;
+
+ ++rf;
+ }
+ }
+
+ for (; rf < &render->info.h264.referenceFrames[H264_RF_COUNT]; ++rf) {
+ rf->surface = VDP_INVALID_HANDLE;
+ rf->is_long_term = 0;
+ rf->top_is_reference = 0;
+ rf->bottom_is_reference = 0;
+ rf->field_order_cnt[0] = 0;
+ rf->field_order_cnt[1] = 0;
+ rf->frame_idx = 0;
+ }
+}
+
+void ff_vdpau_h264_add_data_chunk(H264Context *h, const uint8_t *buf, int buf_size)
+{
+ MpegEncContext * s = &h->s;
+ struct vdpau_render_state * render;
+
+ render = (struct vdpau_render_state*)s->current_picture_ptr->data[0];
+ assert(render);
+
+ if (!render->bitstream_buffers_used)
+ vdpau_h264_set_reference_frames(h);
+
+ render->bitstream_buffers= av_fast_realloc(
+ render->bitstream_buffers,
+ &render->bitstream_buffers_allocated,
+ sizeof(*render->bitstream_buffers)*(render->bitstream_buffers_used + 1)
+ );
+
+ render->bitstream_buffers[render->bitstream_buffers_used].struct_version = VDP_BITSTREAM_BUFFER_VERSION;
+ render->bitstream_buffers[render->bitstream_buffers_used].bitstream = buf;
+ render->bitstream_buffers[render->bitstream_buffers_used].bitstream_bytes = buf_size;
+ render->bitstream_buffers_used++;
+}
+
+void ff_vdpau_h264_picture_complete(H264Context *h)
+{
+ MpegEncContext * s = &h->s;
+ struct vdpau_render_state * render;
+
+ render = (struct vdpau_render_state*)s->current_picture_ptr->data[0];
+ assert(render);
+
+ render->info.h264.slice_count = h->slice_num;
+ if (render->info.h264.slice_count < 1)
+ return;
+
+ for (int i = 0; i < 2; ++i) {
+ int foc = s->current_picture_ptr->field_poc[i];
+ if (foc == INT_MAX)
+ foc = 0;
+ render->info.h264.field_order_cnt[i] = foc;
+ }
+
+ render->info.h264.is_reference = s->current_picture_ptr->reference ? VDP_TRUE : VDP_FALSE;
+ render->info.h264.frame_num = h->frame_num;
+ render->info.h264.field_pic_flag = s->picture_structure != PICT_FRAME;
+ render->info.h264.bottom_field_flag = s->picture_structure == PICT_BOTTOM_FIELD;
+ render->info.h264.num_ref_frames = h->sps.ref_frame_count;
+ render->info.h264.mb_adaptive_frame_field_flag = h->sps.mb_aff;
+ render->info.h264.constrained_intra_pred_flag = h->pps.constrained_intra_pred;
+ render->info.h264.weighted_pred_flag = h->pps.weighted_pred;
+ render->info.h264.weighted_bipred_idc = h->pps.weighted_bipred_idc;
+ render->info.h264.frame_mbs_only_flag = h->sps.frame_mbs_only_flag;
+ render->info.h264.transform_8x8_mode_flag = h->pps.transform_8x8_mode;
+ render->info.h264.chroma_qp_index_offset = h->pps.chroma_qp_index_offset[0];
+ render->info.h264.second_chroma_qp_index_offset = h->pps.chroma_qp_index_offset[1];
+ render->info.h264.pic_init_qp_minus26 = h->pps.init_qp - 26;
+ render->info.h264.num_ref_idx_l0_active_minus1 = h->pps.ref_count[0] - 1;
+ render->info.h264.num_ref_idx_l1_active_minus1 = h->pps.ref_count[1] - 1;
+ render->info.h264.log2_max_frame_num_minus4 = h->sps.log2_max_frame_num - 4;
+ render->info.h264.pic_order_cnt_type = h->sps.poc_type;
+ render->info.h264.log2_max_pic_order_cnt_lsb_minus4 = h->sps.log2_max_poc_lsb - 4;
+ render->info.h264.delta_pic_order_always_zero_flag = h->sps.delta_pic_order_always_zero_flag;
+ render->info.h264.direct_8x8_inference_flag = h->sps.direct_8x8_inference_flag;
+ render->info.h264.entropy_coding_mode_flag = h->pps.cabac;
+ render->info.h264.pic_order_present_flag = h->pps.pic_order_present;
+ render->info.h264.deblocking_filter_control_present_flag = h->pps.deblocking_filter_parameters_present;
+ render->info.h264.redundant_pic_cnt_present_flag = h->pps.redundant_pic_cnt_present;
+ memcpy(render->info.h264.scaling_lists_4x4, h->pps.scaling_matrix4, sizeof(render->info.h264.scaling_lists_4x4));
+ memcpy(render->info.h264.scaling_lists_8x8, h->pps.scaling_matrix8, sizeof(render->info.h264.scaling_lists_8x8));
+
+ ff_draw_horiz_band(s, 0, s->avctx->height);
+ render->bitstream_buffers_used = 0;
+}
+
+/* @}*/
diff --git a/libavcodec/vorbis.c b/libavcodec/vorbis.c
index f39380a..b6243b8 100644
--- a/libavcodec/vorbis.c
+++ b/libavcodec/vorbis.c
@@ -111,7 +111,7 @@ int ff_vorbis_len2vlc(uint8_t *bits, uint32_t *codes, uint_fast32_t num) {
return 0;
}
-void ff_vorbis_ready_floor1_list(floor1_entry_t * list, int values) {
+void ff_vorbis_ready_floor1_list(vorbis_floor1_entry * list, int values) {
int i;
list[0].sort = 0;
list[1].sort = 1;
@@ -162,7 +162,7 @@ static void render_line(int x0, int y0, int x1, int y1, float * buf) {
}
}
-void ff_vorbis_floor1_render_list(floor1_entry_t * list, int values, uint_fast16_t * y_list, int * flag, int multiplier, float * out, int samples) {
+void ff_vorbis_floor1_render_list(vorbis_floor1_entry * list, int values, uint_fast16_t * y_list, int * flag, int multiplier, float * out, int samples) {
int lx, ly, i;
lx = 0;
ly = y_list[0] * multiplier;
diff --git a/libavcodec/vorbis.h b/libavcodec/vorbis.h
index b8a8410..dc99acc 100644
--- a/libavcodec/vorbis.h
+++ b/libavcodec/vorbis.h
@@ -31,12 +31,12 @@ typedef struct {
uint_fast16_t sort;
uint_fast16_t low;
uint_fast16_t high;
-} floor1_entry_t;
+} vorbis_floor1_entry;
-void ff_vorbis_ready_floor1_list(floor1_entry_t * list, int values);
+void ff_vorbis_ready_floor1_list(vorbis_floor1_entry * list, int values);
unsigned int ff_vorbis_nth_root(unsigned int x, unsigned int n); // x^(1/n)
int ff_vorbis_len2vlc(uint8_t *bits, uint32_t *codes, uint_fast32_t num);
-void ff_vorbis_floor1_render_list(floor1_entry_t * list, int values, uint_fast16_t * y_list, int * flag, int multiplier, float * out, int samples);
+void ff_vorbis_floor1_render_list(vorbis_floor1_entry * list, int values, uint_fast16_t * y_list, int * flag, int multiplier, float * out, int samples);
#define ilog(i) av_log2(2*(i))
diff --git a/libavcodec/vorbis_dec.c b/libavcodec/vorbis_dec.c
index 6c15cb8..16f6dec 100644
--- a/libavcodec/vorbis_dec.c
+++ b/libavcodec/vorbis_dec.c
@@ -90,7 +90,7 @@ typedef struct {
int_fast16_t subclass_books[16][8];
uint_fast8_t multiplier;
uint_fast16_t x_list_dim;
- floor1_entry_t * list;
+ vorbis_floor1_entry * list;
} t1;
} data;
} vorbis_floor;
@@ -511,7 +511,7 @@ static int vorbis_parse_setup_hdr_floors(vorbis_context *vc) {
floor_setup->data.t1.x_list_dim+=floor_setup->data.t1.class_dimensions[floor_setup->data.t1.partition_class[j]];
}
- floor_setup->data.t1.list=av_mallocz(floor_setup->data.t1.x_list_dim * sizeof(floor1_entry_t));
+ floor_setup->data.t1.list=av_mallocz(floor_setup->data.t1.x_list_dim * sizeof(vorbis_floor1_entry));
rangebits=get_bits(gb, 4);
diff --git a/libavcodec/vorbis_enc.c b/libavcodec/vorbis_enc.c
index 0cb644b..43350de 100644
--- a/libavcodec/vorbis_enc.c
+++ b/libavcodec/vorbis_enc.c
@@ -48,25 +48,25 @@ typedef struct {
int * quantlist;
float * dimentions;
float * pow2;
-} codebook_t;
+} vorbis_enc_codebook;
typedef struct {
int dim;
int subclass;
int masterbook;
int * books;
-} floor_class_t;
+} vorbis_enc_floor_class;
typedef struct {
int partitions;
int * partition_to_class;
int nclasses;
- floor_class_t * classes;
+ vorbis_enc_floor_class * classes;
int multiplier;
int rangebits;
int values;
- floor1_entry_t * list;
-} floor_t;
+ vorbis_floor1_entry * list;
+} vorbis_enc_floor;
typedef struct {
int type;
@@ -77,7 +77,7 @@ typedef struct {
int classbook;
int8_t (*books)[8];
float (*maxes)[2];
-} residue_t;
+} vorbis_enc_residue;
typedef struct {
int submaps;
@@ -87,12 +87,12 @@ typedef struct {
int coupling_steps;
int * magnitude;
int * angle;
-} mapping_t;
+} vorbis_enc_mapping;
typedef struct {
int blockflag;
int mapping;
-} vorbis_mode_t;
+} vorbis_enc_mode;
typedef struct {
int channels;
@@ -108,24 +108,24 @@ typedef struct {
float quality;
int ncodebooks;
- codebook_t * codebooks;
+ vorbis_enc_codebook * codebooks;
int nfloors;
- floor_t * floors;
+ vorbis_enc_floor * floors;
int nresidues;
- residue_t * residues;
+ vorbis_enc_residue * residues;
int nmappings;
- mapping_t * mappings;
+ vorbis_enc_mapping * mappings;
int nmodes;
- vorbis_mode_t * modes;
+ vorbis_enc_mode * modes;
int64_t sample_count;
-} venc_context_t;
+} vorbis_enc_context;
-static inline void put_codeword(PutBitContext * pb, codebook_t * cb, int entry) {
+static inline void put_codeword(PutBitContext * pb, vorbis_enc_codebook * cb, int entry) {
assert(entry >= 0);
assert(entry < cb->nentries);
assert(cb->lens[entry]);
@@ -138,7 +138,7 @@ static int cb_lookup_vals(int lookup, int dimentions, int entries) {
return 0;
}
-static void ready_codebook(codebook_t * cb) {
+static void ready_codebook(vorbis_enc_codebook * cb) {
int i;
ff_vorbis_len2vlc(cb->lens, cb->codewords, cb->nentries);
@@ -171,13 +171,13 @@ static void ready_codebook(codebook_t * cb) {
}
}
-static void ready_residue(residue_t * rc, venc_context_t * venc) {
+static void ready_residue(vorbis_enc_residue * rc, vorbis_enc_context * venc) {
int i;
assert(rc->type == 2);
rc->maxes = av_mallocz(sizeof(float[2]) * rc->classifications);
for (i = 0; i < rc->classifications; i++) {
int j;
- codebook_t * cb;
+ vorbis_enc_codebook * cb;
for (j = 0; j < 8; j++)
if (rc->books[i][j] != -1) break;
if (j == 8) continue; // zero
@@ -203,10 +203,10 @@ static void ready_residue(residue_t * rc, venc_context_t * venc) {
}
}
-static void create_vorbis_context(venc_context_t * venc, AVCodecContext * avccontext) {
- floor_t * fc;
- residue_t * rc;
- mapping_t * mc;
+static void create_vorbis_context(vorbis_enc_context * venc, AVCodecContext * avccontext) {
+ vorbis_enc_floor * fc;
+ vorbis_enc_residue * rc;
+ vorbis_enc_mapping * mc;
int i, book;
venc->channels = avccontext->channels;
@@ -214,13 +214,13 @@ static void create_vorbis_context(venc_context_t * venc, AVCodecContext * avccon
venc->log2_blocksize[0] = venc->log2_blocksize[1] = 11;
venc->ncodebooks = FF_ARRAY_ELEMS(cvectors);
- venc->codebooks = av_malloc(sizeof(codebook_t) * venc->ncodebooks);
+ venc->codebooks = av_malloc(sizeof(vorbis_enc_codebook) * venc->ncodebooks);
// codebook 0..14 - floor1 book, values 0..255
// codebook 15 residue masterbook
// codebook 16..29 residue
for (book = 0; book < venc->ncodebooks; book++) {
- codebook_t * cb = &venc->codebooks[book];
+ vorbis_enc_codebook * cb = &venc->codebooks[book];
int vals;
cb->ndimentions = cvectors[book].dim;
cb->nentries = cvectors[book].real_len;
@@ -246,7 +246,7 @@ static void create_vorbis_context(venc_context_t * venc, AVCodecContext * avccon
}
venc->nfloors = 1;
- venc->floors = av_malloc(sizeof(floor_t) * venc->nfloors);
+ venc->floors = av_malloc(sizeof(vorbis_enc_floor) * venc->nfloors);
// just 1 floor
fc = &venc->floors[0];
@@ -259,9 +259,9 @@ static void create_vorbis_context(venc_context_t * venc, AVCodecContext * avccon
fc->nclasses = FFMAX(fc->nclasses, fc->partition_to_class[i]);
}
fc->nclasses++;
- fc->classes = av_malloc(sizeof(floor_class_t) * fc->nclasses);
+ fc->classes = av_malloc(sizeof(vorbis_enc_floor_class) * fc->nclasses);
for (i = 0; i < fc->nclasses; i++) {
- floor_class_t * c = &fc->classes[i];
+ vorbis_enc_floor_class * c = &fc->classes[i];
int j, books;
c->dim = floor_classes[i].dim;
c->subclass = floor_classes[i].subclass;
@@ -278,7 +278,7 @@ static void create_vorbis_context(venc_context_t * venc, AVCodecContext * avccon
for (i = 0; i < fc->partitions; i++)
fc->values += fc->classes[fc->partition_to_class[i]].dim;
- fc->list = av_malloc(sizeof(floor1_entry_t) * fc->values);
+ fc->list = av_malloc(sizeof(vorbis_floor1_entry) * fc->values);
fc->list[0].x = 0;
fc->list[1].x = 1 << fc->rangebits;
for (i = 2; i < fc->values; i++) {
@@ -292,7 +292,7 @@ static void create_vorbis_context(venc_context_t * venc, AVCodecContext * avccon
ff_vorbis_ready_floor1_list(fc->list, fc->values);
venc->nresidues = 1;
- venc->residues = av_malloc(sizeof(residue_t) * venc->nresidues);
+ venc->residues = av_malloc(sizeof(vorbis_enc_residue) * venc->nresidues);
// single residue
rc = &venc->residues[0];
@@ -321,7 +321,7 @@ static void create_vorbis_context(venc_context_t * venc, AVCodecContext * avccon
ready_residue(rc, venc);
venc->nmappings = 1;
- venc->mappings = av_malloc(sizeof(mapping_t) * venc->nmappings);
+ venc->mappings = av_malloc(sizeof(vorbis_enc_mapping) * venc->nmappings);
// single mapping
mc = &venc->mappings[0];
@@ -344,7 +344,7 @@ static void create_vorbis_context(venc_context_t * venc, AVCodecContext * avccon
}
venc->nmodes = 1;
- venc->modes = av_malloc(sizeof(vorbis_mode_t) * venc->nmodes);
+ venc->modes = av_malloc(sizeof(vorbis_enc_mode) * venc->nmodes);
// single mode
venc->modes[0].blockflag = 0;
@@ -373,7 +373,7 @@ static void put_float(PutBitContext * pb, float f) {
put_bits(pb, 32, res);
}
-static void put_codebook_header(PutBitContext * pb, codebook_t * cb) {
+static void put_codebook_header(PutBitContext * pb, vorbis_enc_codebook * cb) {
int i;
int ordered = 0;
@@ -432,7 +432,7 @@ static void put_codebook_header(PutBitContext * pb, codebook_t * cb) {
}
}
-static void put_floor_header(PutBitContext * pb, floor_t * fc) {
+static void put_floor_header(PutBitContext * pb, vorbis_enc_floor * fc) {
int i;
put_bits(pb, 16, 1); // type, only floor1 is supported
@@ -464,7 +464,7 @@ static void put_floor_header(PutBitContext * pb, floor_t * fc) {
put_bits(pb, fc->rangebits, fc->list[i].x);
}
-static void put_residue_header(PutBitContext * pb, residue_t * rc) {
+static void put_residue_header(PutBitContext * pb, vorbis_enc_residue * rc) {
int i;
put_bits(pb, 16, rc->type);
@@ -495,7 +495,7 @@ static void put_residue_header(PutBitContext * pb, residue_t * rc) {
}
}
-static int put_main_header(venc_context_t * venc, uint8_t ** out) {
+static int put_main_header(vorbis_enc_context * venc, uint8_t ** out) {
int i;
PutBitContext pb;
uint8_t buffer[50000] = {0}, * p = buffer;
@@ -564,7 +564,7 @@ static int put_main_header(venc_context_t * venc, uint8_t ** out) {
// mappings
put_bits(&pb, 6, venc->nmappings - 1);
for (i = 0; i < venc->nmappings; i++) {
- mapping_t * mc = &venc->mappings[i];
+ vorbis_enc_mapping * mc = &venc->mappings[i];
int j;
put_bits(&pb, 16, 0); // mapping type
@@ -624,7 +624,7 @@ static int put_main_header(venc_context_t * venc, uint8_t ** out) {
return p - *out;
}
-static float get_floor_average(floor_t * fc, float * coeffs, int i) {
+static float get_floor_average(vorbis_enc_floor * fc, float * coeffs, int i) {
int begin = fc->list[fc->list[FFMAX(i-1, 0)].sort].x;
int end = fc->list[fc->list[FFMIN(i+1, fc->values - 1)].sort].x;
int j;
@@ -635,7 +635,7 @@ static float get_floor_average(floor_t * fc, float * coeffs, int i) {
return average / (end - begin);
}
-static void floor_fit(venc_context_t * venc, floor_t * fc, float * coeffs, uint_fast16_t * posts, int samples) {
+static void floor_fit(vorbis_enc_context * venc, vorbis_enc_floor * fc, float * coeffs, uint_fast16_t * posts, int samples) {
int range = 255 / fc->multiplier + 1;
int i;
float tot_average = 0.;
@@ -663,7 +663,7 @@ static int render_point(int x0, int y0, int x1, int y1, int x) {
return y0 + (x - x0) * (y1 - y0) / (x1 - x0);
}
-static void floor_encode(venc_context_t * venc, floor_t * fc, PutBitContext * pb, uint_fast16_t * posts, float * floor, int samples) {
+static void floor_encode(vorbis_enc_context * venc, vorbis_enc_floor * fc, PutBitContext * pb, uint_fast16_t * posts, float * floor, int samples) {
int range = 255 / fc->multiplier + 1;
int coded[fc->values]; // first 2 values are unused
int i, counter;
@@ -704,10 +704,10 @@ static void floor_encode(venc_context_t * venc, floor_t * fc, PutBitContext * pb
counter = 2;
for (i = 0; i < fc->partitions; i++) {
- floor_class_t * c = &fc->classes[fc->partition_to_class[i]];
+ vorbis_enc_floor_class * c = &fc->classes[fc->partition_to_class[i]];
int k, cval = 0, csub = 1<<c->subclass;
if (c->subclass) {
- codebook_t * book = &venc->codebooks[c->masterbook];
+ vorbis_enc_codebook * book = &venc->codebooks[c->masterbook];
int cshift = 0;
for (k = 0; k < c->dim; k++) {
int l;
@@ -737,7 +737,7 @@ static void floor_encode(venc_context_t * venc, floor_t * fc, PutBitContext * pb
ff_vorbis_floor1_render_list(fc->list, fc->values, posts, coded, fc->multiplier, floor, samples);
}
-static float * put_vector(codebook_t * book, PutBitContext * pb, float * num) {
+static float * put_vector(vorbis_enc_codebook * book, PutBitContext * pb, float * num) {
int i, entry = -1;
float distance = FLT_MAX;
assert(book->dimentions);
@@ -756,7 +756,7 @@ static float * put_vector(codebook_t * book, PutBitContext * pb, float * num) {
return &book->dimentions[entry * book->ndimentions];
}
-static void residue_encode(venc_context_t * venc, residue_t * rc, PutBitContext * pb, float * coeffs, int samples, int real_ch) {
+static void residue_encode(vorbis_enc_context * venc, vorbis_enc_residue * rc, PutBitContext * pb, float * coeffs, int samples, int real_ch) {
int pass, i, j, p, k;
int psize = rc->partition_size;
int partitions = (rc->end - rc->begin) / psize;
@@ -785,7 +785,7 @@ static void residue_encode(venc_context_t * venc, residue_t * rc, PutBitContext
while (p < partitions) {
if (pass == 0)
for (j = 0; j < channels; j++) {
- codebook_t * book = &venc->codebooks[rc->classbook];
+ vorbis_enc_codebook * book = &venc->codebooks[rc->classbook];
int entry = 0;
for (i = 0; i < classwords; i++) {
entry *= rc->classifications;
@@ -796,7 +796,7 @@ static void residue_encode(venc_context_t * venc, residue_t * rc, PutBitContext
for (i = 0; i < classwords && p < partitions; i++, p++) {
for (j = 0; j < channels; j++) {
int nbook = rc->books[classes[j][p]][pass];
- codebook_t * book = &venc->codebooks[nbook];
+ vorbis_enc_codebook * book = &venc->codebooks[nbook];
float * buf = coeffs + samples*j + rc->begin + p*psize;
if (nbook == -1) continue;
@@ -841,7 +841,7 @@ static void residue_encode(venc_context_t * venc, residue_t * rc, PutBitContext
}
}
-static int apply_window_and_mdct(venc_context_t * venc, signed short * audio, int samples) {
+static int apply_window_and_mdct(vorbis_enc_context * venc, signed short * audio, int samples) {
int i, j, channel;
const float * win = venc->win[0];
int window_len = 1 << (venc->log2_blocksize[0] - 1);
@@ -893,7 +893,7 @@ static int apply_window_and_mdct(venc_context_t * venc, signed short * audio, in
static av_cold int vorbis_encode_init(AVCodecContext * avccontext)
{
- venc_context_t * venc = avccontext->priv_data;
+ vorbis_enc_context * venc = avccontext->priv_data;
if (avccontext->channels != 2) {
av_log(avccontext, AV_LOG_ERROR, "Current FFmpeg Vorbis encoder only supports 2 channels.\n");
@@ -920,11 +920,11 @@ static av_cold int vorbis_encode_init(AVCodecContext * avccontext)
static int vorbis_encode_frame(AVCodecContext * avccontext, unsigned char * packets, int buf_size, void *data)
{
- venc_context_t * venc = avccontext->priv_data;
+ vorbis_enc_context * venc = avccontext->priv_data;
signed short * audio = data;
int samples = data ? avccontext->frame_size : 0;
- vorbis_mode_t * mode;
- mapping_t * mapping;
+ vorbis_enc_mode * mode;
+ vorbis_enc_mapping * mapping;
PutBitContext pb;
int i;
@@ -945,7 +945,7 @@ static int vorbis_encode_frame(AVCodecContext * avccontext, unsigned char * pack
}
for (i = 0; i < venc->channels; i++) {
- floor_t * fc = &venc->floors[mapping->floor[mapping->mux[i]]];
+ vorbis_enc_floor * fc = &venc->floors[mapping->floor[mapping->mux[i]]];
uint_fast16_t posts[fc->values];
floor_fit(venc, fc, &venc->coeffs[i * samples], posts, samples);
floor_encode(venc, fc, &pb, posts, &venc->floor[i * samples], samples);
@@ -978,7 +978,7 @@ static int vorbis_encode_frame(AVCodecContext * avccontext, unsigned char * pack
static av_cold int vorbis_encode_close(AVCodecContext * avccontext)
{
- venc_context_t * venc = avccontext->priv_data;
+ vorbis_enc_context * venc = avccontext->priv_data;
int i;
if (venc->codebooks)
@@ -1040,7 +1040,7 @@ AVCodec vorbis_encoder = {
"vorbis",
CODEC_TYPE_AUDIO,
CODEC_ID_VORBIS,
- sizeof(venc_context_t),
+ sizeof(vorbis_enc_context),
vorbis_encode_init,
vorbis_encode_frame,
vorbis_encode_close,
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index a5b97ad..2737025 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -1402,14 +1402,14 @@ static void render_slice(Vp3DecodeContext *s, int slice)
/* dequantize the DCT coefficients */
if(s->avctx->idct_algo==FF_IDCT_VP3){
Coeff *coeff= s->coeffs + i;
- memset(block, 0, sizeof(block));
+ s->dsp.clear_block(block);
while(coeff->next){
block[coeff->index]= coeff->coeff * dequantizer[coeff->index];
coeff= coeff->next;
}
}else{
Coeff *coeff= s->coeffs + i;
- memset(block, 0, sizeof(block));
+ s->dsp.clear_block(block);
while(coeff->next){
block[coeff->index]= (coeff->coeff * dequantizer[coeff->index] + 2)>>2;
coeff= coeff->next;
diff --git a/libavcodec/vp5.c b/libavcodec/vp5.c
index 4f9d42f..e47db0d 100644
--- a/libavcodec/vp5.c
+++ b/libavcodec/vp5.c
@@ -33,10 +33,10 @@
#include "vp5data.h"
-static int vp5_parse_header(vp56_context_t *s, const uint8_t *buf, int buf_size,
+static int vp5_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
int *golden_frame)
{
- vp56_range_coder_t *c = &s->c;
+ VP56RangeCoder *c = &s->c;
int rows, cols;
vp56_init_range_decoder(&s->c, buf, buf_size);
@@ -58,7 +58,8 @@ static int vp5_parse_header(vp56_context_t *s, const uint8_t *buf, int buf_size,
vp56_rac_gets(c, 8); /* number of displayed macroblock rows */
vp56_rac_gets(c, 8); /* number of displayed macroblock cols */
vp56_rac_gets(c, 2);
- if (16*cols != s->avctx->coded_width ||
+ if (!s->macroblocks || /* first frame */
+ 16*cols != s->avctx->coded_width ||
16*rows != s->avctx->coded_height) {
avcodec_set_dimensions(s->avctx, 16*cols, 16*rows);
return 2;
@@ -84,10 +85,10 @@ static int vp5_adjust(int v, int t)
return v;
}
-static void vp5_parse_vector_adjustment(vp56_context_t *s, vp56_mv_t *vect)
+static void vp5_parse_vector_adjustment(VP56Context *s, VP56mv *vect)
{
- vp56_range_coder_t *c = &s->c;
- vp56_model_t *model = s->modelp;
+ VP56RangeCoder *c = &s->c;
+ VP56Model *model = s->modelp;
int comp, di;
for (comp=0; comp<2; comp++) {
@@ -108,10 +109,10 @@ static void vp5_parse_vector_adjustment(vp56_context_t *s, vp56_mv_t *vect)
}
}
-static void vp5_parse_vector_models(vp56_context_t *s)
+static void vp5_parse_vector_models(VP56Context *s)
{
- vp56_range_coder_t *c = &s->c;
- vp56_model_t *model = s->modelp;
+ VP56RangeCoder *c = &s->c;
+ VP56Model *model = s->modelp;
int comp, node;
for (comp=0; comp<2; comp++) {
@@ -131,10 +132,10 @@ static void vp5_parse_vector_models(vp56_context_t *s)
model->vector_pdv[comp][node] = vp56_rac_gets_nn(c, 7);
}
-static void vp5_parse_coeff_models(vp56_context_t *s)
+static void vp5_parse_coeff_models(VP56Context *s)
{
- vp56_range_coder_t *c = &s->c;
- vp56_model_t *model = s->modelp;
+ VP56RangeCoder *c = &s->c;
+ VP56Model *model = s->modelp;
uint8_t def_prob[11];
int node, cg, ctx;
int ct; /* code type */
@@ -177,10 +178,10 @@ static void vp5_parse_coeff_models(vp56_context_t *s)
model->coeff_acct[pt][ct][cg][ctx][node] = av_clip(((model->coeff_ract[pt][ct][cg][node] * vp5_ract_lc[ct][cg][node][ctx][0] + 128) >> 8) + vp5_ract_lc[ct][cg][node][ctx][1], 1, 254);
}
-static void vp5_parse_coeff(vp56_context_t *s)
+static void vp5_parse_coeff(VP56Context *s)
{
- vp56_range_coder_t *c = &s->c;
- vp56_model_t *model = s->modelp;
+ VP56RangeCoder *c = &s->c;
+ VP56Model *model = s->modelp;
uint8_t *permute = s->scantable.permutated;
uint8_t *model1, *model2;
int coeff, sign, coeff_idx;
@@ -250,9 +251,9 @@ static void vp5_parse_coeff(vp56_context_t *s)
}
}
-static void vp5_default_models_init(vp56_context_t *s)
+static void vp5_default_models_init(VP56Context *s)
{
- vp56_model_t *model = s->modelp;
+ VP56Model *model = s->modelp;
int i;
for (i=0; i<2; i++) {
@@ -267,7 +268,7 @@ static void vp5_default_models_init(vp56_context_t *s)
static av_cold int vp5_decode_init(AVCodecContext *avctx)
{
- vp56_context_t *s = avctx->priv_data;
+ VP56Context *s = avctx->priv_data;
vp56_init(avctx, 1, 0);
s->vp56_coord_div = vp5_coord_div;
@@ -286,7 +287,7 @@ AVCodec vp5_decoder = {
"vp5",
CODEC_TYPE_VIDEO,
CODEC_ID_VP5,
- sizeof(vp56_context_t),
+ sizeof(VP56Context),
vp5_decode_init,
NULL,
vp56_free,
diff --git a/libavcodec/vp56.c b/libavcodec/vp56.c
index c9daaf7..79667cf 100644
--- a/libavcodec/vp56.c
+++ b/libavcodec/vp56.c
@@ -28,20 +28,20 @@
#include "vp56data.h"
-void vp56_init_dequant(vp56_context_t *s, int quantizer)
+void vp56_init_dequant(VP56Context *s, int quantizer)
{
s->quantizer = quantizer;
s->dequant_dc = vp56_dc_dequant[quantizer] << 2;
s->dequant_ac = vp56_ac_dequant[quantizer] << 2;
}
-static int vp56_get_vectors_predictors(vp56_context_t *s, int row, int col,
- vp56_frame_t ref_frame)
+static int vp56_get_vectors_predictors(VP56Context *s, int row, int col,
+ VP56Frame ref_frame)
{
int nb_pred = 0;
- vp56_mv_t vect[2] = {{0,0}, {0,0}};
+ VP56mv vect[2] = {{0,0}, {0,0}};
int pos, offset;
- vp56_mv_t mvp;
+ VP56mv mvp;
for (pos=0; pos<12; pos++) {
mvp.x = col + vp56_candidate_predictor_pos[pos][0];
@@ -73,10 +73,10 @@ static int vp56_get_vectors_predictors(vp56_context_t *s, int row, int col,
return nb_pred+1;
}
-static void vp56_parse_mb_type_models(vp56_context_t *s)
+static void vp56_parse_mb_type_models(VP56Context *s)
{
- vp56_range_coder_t *c = &s->c;
- vp56_model_t *model = s->modelp;
+ VP56RangeCoder *c = &s->c;
+ VP56Model *model = s->modelp;
int i, ctx, type;
for (ctx=0; ctx<3; ctx++) {
@@ -144,11 +144,11 @@ static void vp56_parse_mb_type_models(vp56_context_t *s)
}
}
-static vp56_mb_t vp56_parse_mb_type(vp56_context_t *s,
- vp56_mb_t prev_type, int ctx)
+static VP56mb vp56_parse_mb_type(VP56Context *s,
+ VP56mb prev_type, int ctx)
{
uint8_t *mb_type_model = s->modelp->mb_type[ctx][prev_type];
- vp56_range_coder_t *c = &s->c;
+ VP56RangeCoder *c = &s->c;
if (vp56_rac_get_prob(c, mb_type_model[0]))
return prev_type;
@@ -156,9 +156,9 @@ static vp56_mb_t vp56_parse_mb_type(vp56_context_t *s,
return vp56_rac_get_tree(c, vp56_pmbt_tree, mb_type_model);
}
-static void vp56_decode_4mv(vp56_context_t *s, int row, int col)
+static void vp56_decode_4mv(VP56Context *s, int row, int col)
{
- vp56_mv_t mv = {0,0};
+ VP56mv mv = {0,0};
int type[4];
int b;
@@ -173,7 +173,7 @@ static void vp56_decode_4mv(vp56_context_t *s, int row, int col)
for (b=0; b<4; b++) {
switch (type[b]) {
case VP56_MB_INTER_NOVEC_PF:
- s->mv[b] = (vp56_mv_t) {0,0};
+ s->mv[b] = (VP56mv) {0,0};
break;
case VP56_MB_INTER_DELTA_PF:
s->parse_vector_adjustment(s, &s->mv[b]);
@@ -197,13 +197,13 @@ static void vp56_decode_4mv(vp56_context_t *s, int row, int col)
s->mv[4].x = s->mv[5].x = RSHIFT(mv.x,2);
s->mv[4].y = s->mv[5].y = RSHIFT(mv.y,2);
} else {
- s->mv[4] = s->mv[5] = (vp56_mv_t) {mv.x/4, mv.y/4};
+ s->mv[4] = s->mv[5] = (VP56mv) {mv.x/4, mv.y/4};
}
}
-static vp56_mb_t vp56_decode_mv(vp56_context_t *s, int row, int col)
+static VP56mb vp56_decode_mv(VP56Context *s, int row, int col)
{
- vp56_mv_t *mv, vect = {0,0};
+ VP56mv *mv, vect = {0,0};
int ctx, b;
ctx = vp56_get_vectors_predictors(s, row, col, VP56_FRAME_PREVIOUS);
@@ -258,14 +258,14 @@ static vp56_mb_t vp56_decode_mv(vp56_context_t *s, int row, int col)
return s->mb_type;
}
-static void vp56_add_predictors_dc(vp56_context_t *s, vp56_frame_t ref_frame)
+static void vp56_add_predictors_dc(VP56Context *s, VP56Frame ref_frame)
{
int idx = s->scantable.permutated[0];
int b;
for (b=0; b<6; b++) {
- vp56_ref_dc_t *ab = &s->above_blocks[s->above_block_idx[b]];
- vp56_ref_dc_t *lb = &s->left_block[vp56_b6to4[b]];
+ VP56RefDc *ab = &s->above_blocks[s->above_block_idx[b]];
+ VP56RefDc *lb = &s->left_block[vp56_b6to4[b]];
int count = 0;
int dc = 0;
int i;
@@ -299,7 +299,7 @@ static void vp56_add_predictors_dc(vp56_context_t *s, vp56_frame_t ref_frame)
}
}
-static void vp56_edge_filter(vp56_context_t *s, uint8_t *yuv,
+static void vp56_edge_filter(VP56Context *s, uint8_t *yuv,
int pix_inc, int line_inc, int t)
{
int pix2_inc = 2 * pix_inc;
@@ -314,7 +314,7 @@ static void vp56_edge_filter(vp56_context_t *s, uint8_t *yuv,
}
}
-static void vp56_deblock_filter(vp56_context_t *s, uint8_t *yuv,
+static void vp56_deblock_filter(VP56Context *s, uint8_t *yuv,
int stride, int dx, int dy)
{
int t = vp56_filter_threshold[s->quantizer];
@@ -322,7 +322,7 @@ static void vp56_deblock_filter(vp56_context_t *s, uint8_t *yuv,
if (dy) vp56_edge_filter(s, yuv + stride*(10-dy), stride, 1, t);
}
-static void vp56_mc(vp56_context_t *s, int b, int plane, uint8_t *src,
+static void vp56_mc(VP56Context *s, int b, int plane, uint8_t *src,
int stride, int x, int y)
{
uint8_t *dst=s->framep[VP56_FRAME_CURRENT]->data[plane]+s->block_offset[b];
@@ -392,11 +392,11 @@ static void vp56_mc(vp56_context_t *s, int b, int plane, uint8_t *src,
}
}
-static void vp56_decode_mb(vp56_context_t *s, int row, int col, int is_alpha)
+static void vp56_decode_mb(VP56Context *s, int row, int col, int is_alpha)
{
AVFrame *frame_current, *frame_ref;
- vp56_mb_t mb_type;
- vp56_frame_t ref_frame;
+ VP56mb mb_type;
+ VP56Frame ref_frame;
int b, ab, b_max, plane, off;
if (s->framep[VP56_FRAME_CURRENT]->key_frame)
@@ -405,7 +405,7 @@ static void vp56_decode_mb(vp56_context_t *s, int row, int col, int is_alpha)
mb_type = vp56_decode_mv(s, row, col);
ref_frame = vp56_reference_frame[mb_type];
- memset(s->block_coeff, 0, sizeof(s->block_coeff));
+ s->dsp.clear_blocks(*s->block_coeff);
s->parse_coeff(s);
@@ -461,7 +461,7 @@ static void vp56_decode_mb(vp56_context_t *s, int row, int col, int is_alpha)
static int vp56_size_changed(AVCodecContext *avctx)
{
- vp56_context_t *s = avctx->priv_data;
+ VP56Context *s = avctx->priv_data;
int stride = s->framep[VP56_FRAME_CURRENT]->linesize[0];
int i;
@@ -497,7 +497,7 @@ static int vp56_size_changed(AVCodecContext *avctx)
int vp56_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
const uint8_t *buf, int buf_size)
{
- vp56_context_t *s = avctx->priv_data;
+ VP56Context *s = avctx->priv_data;
AVFrame *const p = s->framep[VP56_FRAME_CURRENT];
int remaining_buf_size = buf_size;
int is_alpha, alpha_offset;
@@ -646,7 +646,7 @@ int vp56_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
av_cold void vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
{
- vp56_context_t *s = avctx->priv_data;
+ VP56Context *s = avctx->priv_data;
int i;
s->avctx = avctx;
@@ -657,8 +657,6 @@ av_cold void vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
dsputil_init(&s->dsp, avctx);
ff_init_scantable(s->dsp.idct_permutation, &s->scantable,ff_zigzag_direct);
- avcodec_set_dimensions(avctx, 0, 0);
-
for (i=0; i<4; i++)
s->framep[i] = &s->frames[i];
s->framep[VP56_FRAME_UNUSED] = s->framep[VP56_FRAME_GOLDEN];
@@ -686,7 +684,7 @@ av_cold void vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
av_cold int vp56_free(AVCodecContext *avctx)
{
- vp56_context_t *s = avctx->priv_data;
+ VP56Context *s = avctx->priv_data;
av_free(s->above_blocks);
av_free(s->macroblocks);
diff --git a/libavcodec/vp56.h b/libavcodec/vp56.h
index 991e94b..aaa18dd 100644
--- a/libavcodec/vp56.h
+++ b/libavcodec/vp56.h
@@ -30,34 +30,34 @@
#include "bytestream.h"
-typedef struct vp56_context vp56_context_t;
-typedef struct vp56_mv vp56_mv_t;
-
-typedef void (*vp56_parse_vector_adjustment_t)(vp56_context_t *s,
- vp56_mv_t *vect);
-typedef int (*vp56_adjust_t)(int v, int t);
-typedef void (*vp56_filter_t)(vp56_context_t *s, uint8_t *dst, uint8_t *src,
- int offset1, int offset2, int stride,
- vp56_mv_t mv, int mask, int select, int luma);
-typedef void (*vp56_parse_coeff_t)(vp56_context_t *s);
-typedef void (*vp56_default_models_init_t)(vp56_context_t *s);
-typedef void (*vp56_parse_vector_models_t)(vp56_context_t *s);
-typedef void (*vp56_parse_coeff_models_t)(vp56_context_t *s);
-typedef int (*vp56_parse_header_t)(vp56_context_t *s, const uint8_t *buf,
- int buf_size, int *golden_frame);
+typedef struct vp56_context VP56Context;
+typedef struct vp56_mv VP56mv;
+
+typedef void (*VP56ParseVectorAdjustment)(VP56Context *s,
+ VP56mv *vect);
+typedef int (*VP56Adjust)(int v, int t);
+typedef void (*VP56Filter)(VP56Context *s, uint8_t *dst, uint8_t *src,
+ int offset1, int offset2, int stride,
+ VP56mv mv, int mask, int select, int luma);
+typedef void (*VP56ParseCoeff)(VP56Context *s);
+typedef void (*VP56DefaultModelsInit)(VP56Context *s);
+typedef void (*VP56ParseVectorModels)(VP56Context *s);
+typedef void (*VP56ParseCoeffModels)(VP56Context *s);
+typedef int (*VP56ParseHeader)(VP56Context *s, const uint8_t *buf,
+ int buf_size, int *golden_frame);
typedef struct {
int high;
int bits;
const uint8_t *buffer;
unsigned long code_word;
-} vp56_range_coder_t;
+} VP56RangeCoder;
typedef struct {
uint8_t not_null_dc;
- vp56_frame_t ref_frame;
+ VP56Frame ref_frame;
DCTELEM dc_coeff;
-} vp56_ref_dc_t;
+} VP56RefDc;
struct vp56_mv {
int x;
@@ -66,8 +66,8 @@ struct vp56_mv {
typedef struct {
uint8_t type;
- vp56_mv_t mv;
-} vp56_macroblock_t;
+ VP56mv mv;
+} VP56Macroblock;
typedef struct {
uint8_t coeff_reorder[64]; /* used in vp6 only */
@@ -84,7 +84,7 @@ typedef struct {
uint8_t coeff_runv[2][14]; /* run value (vp6 only) */
uint8_t mb_type[3][10][10]; /* model for decoding MB type */
uint8_t mb_types_stats[3][10][2];/* contextual, next MB type stats */
-} vp56_model_t;
+} VP56Model;
struct vp56_context {
AVCodecContext *avctx;
@@ -94,9 +94,9 @@ struct vp56_context {
AVFrame *framep[6];
uint8_t *edge_emu_buffer_alloc;
uint8_t *edge_emu_buffer;
- vp56_range_coder_t c;
- vp56_range_coder_t cc;
- vp56_range_coder_t *ccp;
+ VP56RangeCoder c;
+ VP56RangeCoder cc;
+ VP56RangeCoder *ccp;
int sub_version;
/* frame info */
@@ -111,19 +111,19 @@ struct vp56_context {
uint16_t dequant_ac;
/* DC predictors management */
- vp56_ref_dc_t *above_blocks;
- vp56_ref_dc_t left_block[4];
+ VP56RefDc *above_blocks;
+ VP56RefDc left_block[4];
int above_block_idx[6];
DCTELEM prev_dc[3][3]; /* [plan][ref_frame] */
/* blocks / macroblock */
- vp56_mb_t mb_type;
- vp56_macroblock_t *macroblocks;
+ VP56mb mb_type;
+ VP56Macroblock *macroblocks;
DECLARE_ALIGNED_16(DCTELEM, block_coeff[6][64]);
/* motion vectors */
- vp56_mv_t mv[6]; /* vectors for each block in MB */
- vp56_mv_t vector_candidate[2];
+ VP56mv mv[6]; /* vectors for each block in MB */
+ VP56mv vector_candidate[2];
int vector_candidate_pos;
/* filtering hints */
@@ -146,17 +146,17 @@ struct vp56_context {
int stride[4]; /* stride for each plan */
const uint8_t *vp56_coord_div;
- vp56_parse_vector_adjustment_t parse_vector_adjustment;
- vp56_adjust_t adjust;
- vp56_filter_t filter;
- vp56_parse_coeff_t parse_coeff;
- vp56_default_models_init_t default_models_init;
- vp56_parse_vector_models_t parse_vector_models;
- vp56_parse_coeff_models_t parse_coeff_models;
- vp56_parse_header_t parse_header;
-
- vp56_model_t *modelp;
- vp56_model_t models[2];
+ VP56ParseVectorAdjustment parse_vector_adjustment;
+ VP56Adjust adjust;
+ VP56Filter filter;
+ VP56ParseCoeff parse_coeff;
+ VP56DefaultModelsInit default_models_init;
+ VP56ParseVectorModels parse_vector_models;
+ VP56ParseCoeffModels parse_coeff_models;
+ VP56ParseHeader parse_header;
+
+ VP56Model *modelp;
+ VP56Model models[2];
/* huffman decoding */
int use_huffman;
@@ -170,7 +170,7 @@ struct vp56_context {
void vp56_init(AVCodecContext *avctx, int flip, int has_alpha);
int vp56_free(AVCodecContext *avctx);
-void vp56_init_dequant(vp56_context_t *s, int quantizer);
+void vp56_init_dequant(VP56Context *s, int quantizer);
int vp56_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
const uint8_t *buf, int buf_size);
@@ -179,7 +179,7 @@ int vp56_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
* vp56 specific range coder implementation
*/
-static inline void vp56_init_range_decoder(vp56_range_coder_t *c,
+static inline void vp56_init_range_decoder(VP56RangeCoder *c,
const uint8_t *buf, int buf_size)
{
c->high = 255;
@@ -188,7 +188,7 @@ static inline void vp56_init_range_decoder(vp56_range_coder_t *c,
c->code_word = bytestream_get_be16(&c->buffer);
}
-static inline int vp56_rac_get_prob(vp56_range_coder_t *c, uint8_t prob)
+static inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
{
unsigned int low = 1 + (((c->high - 1) * prob) / 256);
unsigned int low_shift = low << 8;
@@ -213,7 +213,7 @@ static inline int vp56_rac_get_prob(vp56_range_coder_t *c, uint8_t prob)
return bit;
}
-static inline int vp56_rac_get(vp56_range_coder_t *c)
+static inline int vp56_rac_get(VP56RangeCoder *c)
{
/* equiprobable */
int low = (c->high + 1) >> 1;
@@ -235,7 +235,7 @@ static inline int vp56_rac_get(vp56_range_coder_t *c)
return bit;
}
-static inline int vp56_rac_gets(vp56_range_coder_t *c, int bits)
+static inline int vp56_rac_gets(VP56RangeCoder *c, int bits)
{
int value = 0;
@@ -246,14 +246,14 @@ static inline int vp56_rac_gets(vp56_range_coder_t *c, int bits)
return value;
}
-static inline int vp56_rac_gets_nn(vp56_range_coder_t *c, int bits)
+static inline int vp56_rac_gets_nn(VP56RangeCoder *c, int bits)
{
int v = vp56_rac_gets(c, 7) << 1;
return v + !v;
}
-static inline int vp56_rac_get_tree(vp56_range_coder_t *c,
- const vp56_tree_t *tree,
+static inline int vp56_rac_get_tree(VP56RangeCoder *c,
+ const VP56Tree *tree,
const uint8_t *probs)
{
while (tree->val > 0) {
diff --git a/libavcodec/vp56data.c b/libavcodec/vp56data.c
index a7171c6..9b98014 100644
--- a/libavcodec/vp56data.c
+++ b/libavcodec/vp56data.c
@@ -44,7 +44,7 @@ const uint8_t vp56_def_mb_types_stats[3][10][2] = {
{ 1, 2 }, { 0, 1 }, { 0, 1 }, { 1, 1 }, { 0, 0 }, },
};
-const vp56_tree_t vp56_pva_tree[] = {
+const VP56Tree vp56_pva_tree[] = {
{ 8, 0},
{ 4, 1},
{ 2, 2}, {-0}, {-1},
@@ -54,7 +54,7 @@ const vp56_tree_t vp56_pva_tree[] = {
{ 2, 6}, {-6}, {-7},
};
-const vp56_tree_t vp56_pc_tree[] = {
+const VP56Tree vp56_pc_tree[] = {
{ 4, 6},
{ 2, 7}, {-0}, {-1},
{ 4, 8},
diff --git a/libavcodec/vp56data.h b/libavcodec/vp56data.h
index 95dc633..92f2512 100644
--- a/libavcodec/vp56data.h
+++ b/libavcodec/vp56data.h
@@ -34,7 +34,7 @@ typedef enum {
VP56_FRAME_GOLDEN2 = 3,
VP56_FRAME_UNUSED = 4,
VP56_FRAME_UNUSED2 = 5,
-} vp56_frame_t;
+} VP56Frame;
typedef enum {
VP56_MB_INTER_NOVEC_PF = 0, /**< Inter MB, no vector, from previous frame */
@@ -47,23 +47,23 @@ typedef enum {
VP56_MB_INTER_4V = 7, /**< Inter MB, 4 vectors, from previous frame */
VP56_MB_INTER_V1_GF = 8, /**< Inter MB, first vector, from golden frame */
VP56_MB_INTER_V2_GF = 9, /**< Inter MB, second vector, from golden frame */
-} vp56_mb_t;
+} VP56mb;
typedef struct {
int8_t val;
int8_t prob_idx;
-} vp56_tree_t;
+} VP56Tree;
extern const uint8_t vp56_b2p[];
extern const uint8_t vp56_b6to4[];
extern const uint8_t vp56_coeff_parse_table[6][11];
extern const uint8_t vp56_def_mb_types_stats[3][10][2];
-extern const vp56_tree_t vp56_pva_tree[];
-extern const vp56_tree_t vp56_pc_tree[];
+extern const VP56Tree vp56_pva_tree[];
+extern const VP56Tree vp56_pc_tree[];
extern const uint8_t vp56_coeff_bias[];
extern const uint8_t vp56_coeff_bit_length[];
-static const vp56_frame_t vp56_reference_frame[] = {
+static const VP56Frame vp56_reference_frame[] = {
VP56_FRAME_PREVIOUS, /* VP56_MB_INTER_NOVEC_PF */
VP56_FRAME_CURRENT, /* VP56_MB_INTRA */
VP56_FRAME_PREVIOUS, /* VP56_MB_INTER_DELTA_PF */
@@ -212,7 +212,7 @@ static const uint8_t vp56_mb_type_model_model[] = {
171, 83, 199, 140, 125, 104,
};
-static const vp56_tree_t vp56_pmbtm_tree[] = {
+static const VP56Tree vp56_pmbtm_tree[] = {
{ 4, 0},
{ 2, 1}, {-8}, {-4},
{ 8, 2},
@@ -221,7 +221,7 @@ static const vp56_tree_t vp56_pmbtm_tree[] = {
{ 2, 5}, {-24}, {-20}, {-16}, {-12}, {-0},
};
-static const vp56_tree_t vp56_pmbt_tree[] = {
+static const VP56Tree vp56_pmbt_tree[] = {
{ 8, 1},
{ 4, 2},
{ 2, 4}, {-VP56_MB_INTER_NOVEC_PF}, {-VP56_MB_INTER_DELTA_PF},
diff --git a/libavcodec/vp6.c b/libavcodec/vp6.c
index bea7c9e..6ffde07 100644
--- a/libavcodec/vp6.c
+++ b/libavcodec/vp6.c
@@ -37,13 +37,13 @@
#include "vp6data.h"
-static void vp6_parse_coeff(vp56_context_t *s);
-static void vp6_parse_coeff_huffman(vp56_context_t *s);
+static void vp6_parse_coeff(VP56Context *s);
+static void vp6_parse_coeff_huffman(VP56Context *s);
-static int vp6_parse_header(vp56_context_t *s, const uint8_t *buf, int buf_size,
+static int vp6_parse_header(VP56Context *s, const uint8_t *buf, int buf_size,
int *golden_frame)
{
- vp56_range_coder_t *c = &s->c;
+ VP56RangeCoder *c = &s->c;
int parse_filter_info = 0;
int coeff_offset = 0;
int vrt_shift = 0;
@@ -75,7 +75,8 @@ static int vp6_parse_header(vp56_context_t *s, const uint8_t *buf, int buf_size,
/* buf[4] is number of displayed macroblock rows */
/* buf[5] is number of displayed macroblock cols */
- if (16*cols != s->avctx->coded_width ||
+ if (!s->macroblocks || /* first frame */
+ 16*cols != s->avctx->coded_width ||
16*rows != s->avctx->coded_height) {
avcodec_set_dimensions(s->avctx, 16*cols, 16*rows);
if (s->avctx->extradata_size == 1) {
@@ -151,7 +152,7 @@ static int vp6_parse_header(vp56_context_t *s, const uint8_t *buf, int buf_size,
return res;
}
-static void vp6_coeff_order_table_init(vp56_context_t *s)
+static void vp6_coeff_order_table_init(VP56Context *s)
{
int i, pos, idx = 1;
@@ -162,9 +163,9 @@ static void vp6_coeff_order_table_init(vp56_context_t *s)
s->modelp->coeff_index_to_pos[idx++] = pos;
}
-static void vp6_default_models_init(vp56_context_t *s)
+static void vp6_default_models_init(VP56Context *s)
{
- vp56_model_t *model = s->modelp;
+ VP56Model *model = s->modelp;
model->vector_dct[0] = 0xA2;
model->vector_dct[1] = 0xA4;
@@ -180,10 +181,10 @@ static void vp6_default_models_init(vp56_context_t *s)
vp6_coeff_order_table_init(s);
}
-static void vp6_parse_vector_models(vp56_context_t *s)
+static void vp6_parse_vector_models(VP56Context *s)
{
- vp56_range_coder_t *c = &s->c;
- vp56_model_t *model = s->modelp;
+ VP56RangeCoder *c = &s->c;
+ VP56Model *model = s->modelp;
int comp, node;
for (comp=0; comp<2; comp++) {
@@ -211,7 +212,7 @@ static int vp6_huff_cmp(const void *va, const void *vb)
return (a->count - b->count)*16 + (b->sym - a->sym);
}
-static void vp6_build_huff_tree(vp56_context_t *s, uint8_t coeff_model[],
+static void vp6_build_huff_tree(VP56Context *s, uint8_t coeff_model[],
const uint8_t *map, unsigned size, VLC *vlc)
{
Node nodes[2*size], *tmp = &nodes[size];
@@ -231,10 +232,10 @@ static void vp6_build_huff_tree(vp56_context_t *s, uint8_t coeff_model[],
FF_HUFFMAN_FLAG_HNODE_FIRST);
}
-static void vp6_parse_coeff_models(vp56_context_t *s)
+static void vp6_parse_coeff_models(VP56Context *s)
{
- vp56_range_coder_t *c = &s->c;
- vp56_model_t *model = s->modelp;
+ VP56RangeCoder *c = &s->c;
+ VP56Model *model = s->modelp;
int def_prob[11];
int node, cg, ctx, pos;
int ct; /* code type */
@@ -296,13 +297,13 @@ static void vp6_parse_coeff_models(vp56_context_t *s)
}
}
-static void vp6_parse_vector_adjustment(vp56_context_t *s, vp56_mv_t *vect)
+static void vp6_parse_vector_adjustment(VP56Context *s, VP56mv *vect)
{
- vp56_range_coder_t *c = &s->c;
- vp56_model_t *model = s->modelp;
+ VP56RangeCoder *c = &s->c;
+ VP56Model *model = s->modelp;
int comp;
- *vect = (vp56_mv_t) {0,0};
+ *vect = (VP56mv) {0,0};
if (s->vector_candidate_pos < 2)
*vect = s->vector_candidate[0];
@@ -338,7 +339,7 @@ static void vp6_parse_vector_adjustment(vp56_context_t *s, vp56_mv_t *vect)
* Read number of consecutive blocks with null DC or AC.
* This value is < 74.
*/
-static unsigned vp6_get_nb_null(vp56_context_t *s)
+static unsigned vp6_get_nb_null(VP56Context *s)
{
unsigned val = get_bits(&s->gb, 2);
if (val == 2)
@@ -350,9 +351,9 @@ static unsigned vp6_get_nb_null(vp56_context_t *s)
return val;
}
-static void vp6_parse_coeff_huffman(vp56_context_t *s)
+static void vp6_parse_coeff_huffman(VP56Context *s)
{
- vp56_model_t *model = s->modelp;
+ VP56Model *model = s->modelp;
uint8_t *permute = s->scantable.permutated;
VLC *vlc_coeff;
int coeff, sign, coeff_idx;
@@ -405,10 +406,10 @@ static void vp6_parse_coeff_huffman(vp56_context_t *s)
}
}
-static void vp6_parse_coeff(vp56_context_t *s)
+static void vp6_parse_coeff(VP56Context *s)
{
- vp56_range_coder_t *c = s->ccp;
- vp56_model_t *model = s->modelp;
+ VP56RangeCoder *c = s->ccp;
+ VP56Model *model = s->modelp;
uint8_t *permute = s->scantable.permutated;
uint8_t *model1, *model2, *model3;
int coeff, sign, coeff_idx;
@@ -522,7 +523,7 @@ static void vp6_filter_hv4(uint8_t *dst, uint8_t *src, int stride,
}
}
-static void vp6_filter_diag2(vp56_context_t *s, uint8_t *dst, uint8_t *src,
+static void vp6_filter_diag2(VP56Context *s, uint8_t *dst, uint8_t *src,
int stride, int h_weight, int v_weight)
{
uint8_t *tmp = s->edge_emu_buffer+16;
@@ -563,9 +564,9 @@ static void vp6_filter_diag4(uint8_t *dst, uint8_t *src, int stride,
}
}
-static void vp6_filter(vp56_context_t *s, uint8_t *dst, uint8_t *src,
+static void vp6_filter(VP56Context *s, uint8_t *dst, uint8_t *src,
int offset1, int offset2, int stride,
- vp56_mv_t mv, int mask, int select, int luma)
+ VP56mv mv, int mask, int select, int luma)
{
int filter4 = 0;
int x8 = mv.x & mask;
@@ -615,7 +616,7 @@ static void vp6_filter(vp56_context_t *s, uint8_t *dst, uint8_t *src,
static av_cold int vp6_decode_init(AVCodecContext *avctx)
{
- vp56_context_t *s = avctx->priv_data;
+ VP56Context *s = avctx->priv_data;
vp56_init(avctx, avctx->codec->id == CODEC_ID_VP6,
avctx->codec->id == CODEC_ID_VP6A);
@@ -635,7 +636,7 @@ AVCodec vp6_decoder = {
"vp6",
CODEC_TYPE_VIDEO,
CODEC_ID_VP6,
- sizeof(vp56_context_t),
+ sizeof(VP56Context),
vp6_decode_init,
NULL,
vp56_free,
@@ -649,7 +650,7 @@ AVCodec vp6f_decoder = {
"vp6f",
CODEC_TYPE_VIDEO,
CODEC_ID_VP6F,
- sizeof(vp56_context_t),
+ sizeof(VP56Context),
vp6_decode_init,
NULL,
vp56_free,
@@ -663,7 +664,7 @@ AVCodec vp6a_decoder = {
"vp6a",
CODEC_TYPE_VIDEO,
CODEC_ID_VP6A,
- sizeof(vp56_context_t),
+ sizeof(VP56Context),
vp6_decode_init,
NULL,
vp56_free,
diff --git a/libavcodec/vp6data.h b/libavcodec/vp6data.h
index 31d6c98..f57115c 100644
--- a/libavcodec/vp6data.h
+++ b/libavcodec/vp6data.h
@@ -283,7 +283,7 @@ static const int16_t vp6_block_copy_filter[17][8][4] = {
{ -2, 16, 118, -4 } },
};
-static const vp56_tree_t vp6_pcr_tree[] = {
+static const VP56Tree vp6_pcr_tree[] = {
{ 8, 0},
{ 4, 1},
{ 2, 2}, {-1}, {-2},
diff --git a/libavcodec/wavpack.c b/libavcodec/wavpack.c
index e8703b3..b89723e 100644
--- a/libavcodec/wavpack.c
+++ b/libavcodec/wavpack.c
@@ -361,6 +361,7 @@ static av_cold int wavpack_decode_init(AVCodecContext *avctx)
s->avctx = avctx;
s->stereo = (avctx->channels == 2);
avctx->sample_fmt = SAMPLE_FMT_S16;
+ avctx->channel_layout = (avctx->channels==2) ? CH_LAYOUT_STEREO : CH_LAYOUT_MONO;
return 0;
}
diff --git a/libavcodec/wmv2.c b/libavcodec/wmv2.c
index 1e29363..015e3f2 100644
--- a/libavcodec/wmv2.c
+++ b/libavcodec/wmv2.c
@@ -43,12 +43,12 @@ static void wmv2_add_block(Wmv2Context *w, DCTELEM *block1, uint8_t *dst, int st
case 1:
ff_simple_idct84_add(dst , stride, block1);
ff_simple_idct84_add(dst + 4*stride, stride, w->abt_block2[n]);
- memset(w->abt_block2[n], 0, 64*sizeof(DCTELEM));
+ s->dsp.clear_block(w->abt_block2[n]);
break;
case 2:
ff_simple_idct48_add(dst , stride, block1);
ff_simple_idct48_add(dst + 4 , stride, w->abt_block2[n]);
- memset(w->abt_block2[n], 0, 64*sizeof(DCTELEM));
+ s->dsp.clear_block(w->abt_block2[n]);
break;
default:
av_log(s->avctx, AV_LOG_ERROR, "internal error in WMV2 abt\n");
diff --git a/libavcodec/x86/cavsdsp_mmx.c b/libavcodec/x86/cavsdsp_mmx.c
new file mode 100644
index 0000000..2000ba5
--- /dev/null
+++ b/libavcodec/x86/cavsdsp_mmx.c
@@ -0,0 +1,497 @@
+/*
+ * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
+ * Copyright (c) 2006 Stefan Gehrer <stefan.gehrer at gmx.de>
+ *
+ * MMX-optimized DSP functions, based on H.264 optimizations by
+ * Michael Niedermayer and Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+#include "dsputil_mmx.h"
+
+/*****************************************************************************
+ *
+ * inverse transform
+ *
+ ****************************************************************************/
+
+static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
+{
+ __asm__ volatile(
+ "movq 112(%0), %%mm4 \n\t" /* mm4 = src7 */
+ "movq 16(%0), %%mm5 \n\t" /* mm5 = src1 */
+ "movq 80(%0), %%mm2 \n\t" /* mm2 = src5 */
+ "movq 48(%0), %%mm7 \n\t" /* mm7 = src3 */
+ "movq %%mm4, %%mm0 \n\t"
+ "movq %%mm5, %%mm3 \n\t"
+ "movq %%mm2, %%mm6 \n\t"
+ "movq %%mm7, %%mm1 \n\t"
+
+ "paddw %%mm4, %%mm4 \n\t" /* mm4 = 2*src7 */
+ "paddw %%mm3, %%mm3 \n\t" /* mm3 = 2*src1 */
+ "paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*src5 */
+ "paddw %%mm1, %%mm1 \n\t" /* mm1 = 2*src3 */
+ "paddw %%mm4, %%mm0 \n\t" /* mm0 = 3*src7 */
+ "paddw %%mm3, %%mm5 \n\t" /* mm5 = 3*src1 */
+ "paddw %%mm6, %%mm2 \n\t" /* mm2 = 3*src5 */
+ "paddw %%mm1, %%mm7 \n\t" /* mm7 = 3*src3 */
+ "psubw %%mm4, %%mm5 \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
+ "paddw %%mm6, %%mm7 \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
+ "psubw %%mm2, %%mm1 \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
+ "paddw %%mm0, %%mm3 \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */
+
+ "movq %%mm5, %%mm4 \n\t"
+ "movq %%mm7, %%mm6 \n\t"
+ "movq %%mm3, %%mm0 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ SUMSUB_BA( %%mm7, %%mm5 ) /* mm7 = a0 + a1 mm5 = a0 - a1 */
+ "paddw %%mm3, %%mm7 \n\t" /* mm7 = a0 + a1 + a3 */
+ "paddw %%mm1, %%mm5 \n\t" /* mm5 = a0 - a1 + a2 */
+ "paddw %%mm7, %%mm7 \n\t"
+ "paddw %%mm5, %%mm5 \n\t"
+ "paddw %%mm6, %%mm7 \n\t" /* mm7 = b4 */
+ "paddw %%mm4, %%mm5 \n\t" /* mm5 = b5 */
+
+ SUMSUB_BA( %%mm1, %%mm3 ) /* mm1 = a3 + a2 mm3 = a3 - a2 */
+ "psubw %%mm1, %%mm4 \n\t" /* mm4 = a0 - a2 - a3 */
+ "movq %%mm4, %%mm1 \n\t" /* mm1 = a0 - a2 - a3 */
+ "psubw %%mm6, %%mm3 \n\t" /* mm3 = a3 - a2 - a1 */
+ "paddw %%mm1, %%mm1 \n\t"
+ "paddw %%mm3, %%mm3 \n\t"
+ "psubw %%mm2, %%mm1 \n\t" /* mm1 = b7 */
+ "paddw %%mm0, %%mm3 \n\t" /* mm3 = b6 */
+
+ "movq 32(%0), %%mm2 \n\t" /* mm2 = src2 */
+ "movq 96(%0), %%mm6 \n\t" /* mm6 = src6 */
+ "movq %%mm2, %%mm4 \n\t"
+ "movq %%mm6, %%mm0 \n\t"
+ "psllw $2, %%mm4 \n\t" /* mm4 = 4*src2 */
+ "psllw $2, %%mm6 \n\t" /* mm6 = 4*src6 */
+ "paddw %%mm4, %%mm2 \n\t" /* mm2 = 5*src2 */
+ "paddw %%mm6, %%mm0 \n\t" /* mm0 = 5*src6 */
+ "paddw %%mm2, %%mm2 \n\t"
+ "paddw %%mm0, %%mm0 \n\t"
+ "psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */
+ "paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */
+
+ "movq (%0), %%mm2 \n\t" /* mm2 = src0 */
+ "movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */
+ SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */
+ "psllw $3, %%mm0 \n\t"
+ "psllw $3, %%mm2 \n\t"
+ "paddw %1, %%mm0 \n\t" /* add rounding bias */
+ "paddw %1, %%mm2 \n\t" /* add rounding bias */
+
+ SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */
+ SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */
+ SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */
+ SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */
+ SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */
+ SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */
+ :: "r"(block), "m"(bias)
+ );
+}
+
+static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
+{
+ int i;
+ DECLARE_ALIGNED_8(int16_t, b2[64]);
+
+ for(i=0; i<2; i++){
+ DECLARE_ALIGNED_8(uint64_t, tmp);
+
+ cavs_idct8_1d(block+4*i, ff_pw_4);
+
+ __asm__ volatile(
+ "psraw $3, %%mm7 \n\t"
+ "psraw $3, %%mm6 \n\t"
+ "psraw $3, %%mm5 \n\t"
+ "psraw $3, %%mm4 \n\t"
+ "psraw $3, %%mm3 \n\t"
+ "psraw $3, %%mm2 \n\t"
+ "psraw $3, %%mm1 \n\t"
+ "psraw $3, %%mm0 \n\t"
+ "movq %%mm7, %0 \n\t"
+ TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
+ "movq %%mm0, 8(%1) \n\t"
+ "movq %%mm6, 24(%1) \n\t"
+ "movq %%mm7, 40(%1) \n\t"
+ "movq %%mm4, 56(%1) \n\t"
+ "movq %0, %%mm7 \n\t"
+ TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
+ "movq %%mm7, (%1) \n\t"
+ "movq %%mm1, 16(%1) \n\t"
+ "movq %%mm0, 32(%1) \n\t"
+ "movq %%mm3, 48(%1) \n\t"
+ : "=m"(tmp)
+ : "r"(b2+32*i)
+ : "memory"
+ );
+ }
+
+ for(i=0; i<2; i++){
+ cavs_idct8_1d(b2+4*i, ff_pw_64);
+
+ __asm__ volatile(
+ "psraw $7, %%mm7 \n\t"
+ "psraw $7, %%mm6 \n\t"
+ "psraw $7, %%mm5 \n\t"
+ "psraw $7, %%mm4 \n\t"
+ "psraw $7, %%mm3 \n\t"
+ "psraw $7, %%mm2 \n\t"
+ "psraw $7, %%mm1 \n\t"
+ "psraw $7, %%mm0 \n\t"
+ "movq %%mm7, (%0) \n\t"
+ "movq %%mm5, 16(%0) \n\t"
+ "movq %%mm3, 32(%0) \n\t"
+ "movq %%mm1, 48(%0) \n\t"
+ "movq %%mm0, 64(%0) \n\t"
+ "movq %%mm2, 80(%0) \n\t"
+ "movq %%mm4, 96(%0) \n\t"
+ "movq %%mm6, 112(%0) \n\t"
+ :: "r"(b2+4*i)
+ : "memory"
+ );
+ }
+
+ add_pixels_clamped_mmx(b2, dst, stride);
+
+ /* clear block */
+ __asm__ volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "movq %%mm7, (%0) \n\t"
+ "movq %%mm7, 8(%0) \n\t"
+ "movq %%mm7, 16(%0) \n\t"
+ "movq %%mm7, 24(%0) \n\t"
+ "movq %%mm7, 32(%0) \n\t"
+ "movq %%mm7, 40(%0) \n\t"
+ "movq %%mm7, 48(%0) \n\t"
+ "movq %%mm7, 56(%0) \n\t"
+ "movq %%mm7, 64(%0) \n\t"
+ "movq %%mm7, 72(%0) \n\t"
+ "movq %%mm7, 80(%0) \n\t"
+ "movq %%mm7, 88(%0) \n\t"
+ "movq %%mm7, 96(%0) \n\t"
+ "movq %%mm7, 104(%0) \n\t"
+ "movq %%mm7, 112(%0) \n\t"
+ "movq %%mm7, 120(%0) \n\t"
+ :: "r" (block)
+ );
+}
+
+/*****************************************************************************
+ *
+ * motion compensation
+ *
+ ****************************************************************************/
+
+/* vertical filter [-1 -2 96 42 -7 0] */
+#define QPEL_CAVSV1(A,B,C,D,E,F,OP) \
+ "movd (%0), "#F" \n\t"\
+ "movq "#C", %%mm6 \n\t"\
+ "pmullw %5, %%mm6 \n\t"\
+ "movq "#D", %%mm7 \n\t"\
+ "pmullw %6, %%mm7 \n\t"\
+ "psllw $3, "#E" \n\t"\
+ "psubw "#E", %%mm6 \n\t"\
+ "psraw $3, "#E" \n\t"\
+ "paddw %%mm7, %%mm6 \n\t"\
+ "paddw "#E", %%mm6 \n\t"\
+ "paddw "#B", "#B" \n\t"\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, "#F" \n\t"\
+ "psubw "#B", %%mm6 \n\t"\
+ "psraw $1, "#B" \n\t"\
+ "psubw "#A", %%mm6 \n\t"\
+ "paddw %4, %%mm6 \n\t"\
+ "psraw $7, %%mm6 \n\t"\
+ "packuswb %%mm6, %%mm6 \n\t"\
+ OP(%%mm6, (%1), A, d) \
+ "add %3, %1 \n\t"
+
+/* vertical filter [ 0 -1 5 5 -1 0] */
+#define QPEL_CAVSV2(A,B,C,D,E,F,OP) \
+ "movd (%0), "#F" \n\t"\
+ "movq "#C", %%mm6 \n\t"\
+ "paddw "#D", %%mm6 \n\t"\
+ "pmullw %5, %%mm6 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, "#F" \n\t"\
+ "psubw "#B", %%mm6 \n\t"\
+ "psubw "#E", %%mm6 \n\t"\
+ "paddw %4, %%mm6 \n\t"\
+ "psraw $3, %%mm6 \n\t"\
+ "packuswb %%mm6, %%mm6 \n\t"\
+ OP(%%mm6, (%1), A, d) \
+ "add %3, %1 \n\t"
+
+/* vertical filter [ 0 -7 42 96 -2 -1] */
+#define QPEL_CAVSV3(A,B,C,D,E,F,OP) \
+ "movd (%0), "#F" \n\t"\
+ "movq "#C", %%mm6 \n\t"\
+ "pmullw %6, %%mm6 \n\t"\
+ "movq "#D", %%mm7 \n\t"\
+ "pmullw %5, %%mm7 \n\t"\
+ "psllw $3, "#B" \n\t"\
+ "psubw "#B", %%mm6 \n\t"\
+ "psraw $3, "#B" \n\t"\
+ "paddw %%mm7, %%mm6 \n\t"\
+ "paddw "#B", %%mm6 \n\t"\
+ "paddw "#E", "#E" \n\t"\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, "#F" \n\t"\
+ "psubw "#E", %%mm6 \n\t"\
+ "psraw $1, "#E" \n\t"\
+ "psubw "#F", %%mm6 \n\t"\
+ "paddw %4, %%mm6 \n\t"\
+ "psraw $7, %%mm6 \n\t"\
+ "packuswb %%mm6, %%mm6 \n\t"\
+ OP(%%mm6, (%1), A, d) \
+ "add %3, %1 \n\t"
+
+
+#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
+ int w= 2;\
+ src -= 2*srcStride;\
+ \
+ while(w--){\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd (%0), %%mm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
+ VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
+ VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\
+ : "memory"\
+ );\
+ if(h==16){\
+ __asm__ volatile(\
+ VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
+ VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
+ VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\
+ : "memory"\
+ );\
+ }\
+ src += 4-(h+5)*srcStride;\
+ dst += 4-h*dstStride;\
+ }
+
+#define QPEL_CAVS(OPNAME, OP, MMX)\
+static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ int h=8;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %5, %%mm6 \n\t"\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 1(%0), %%mm2 \n\t"\
+ "movq %%mm0, %%mm1 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "pmullw %%mm6, %%mm0 \n\t"\
+ "pmullw %%mm6, %%mm1 \n\t"\
+ "movq -1(%0), %%mm2 \n\t"\
+ "movq 2(%0), %%mm4 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "movq %%mm4, %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "punpckhbw %%mm7, %%mm5 \n\t"\
+ "paddw %%mm4, %%mm2 \n\t"\
+ "paddw %%mm3, %%mm5 \n\t"\
+ "psubw %%mm2, %%mm0 \n\t"\
+ "psubw %%mm5, %%mm1 \n\t"\
+ "movq %6, %%mm5 \n\t"\
+ "paddw %%mm5, %%mm0 \n\t"\
+ "paddw %%mm5, %%mm1 \n\t"\
+ "psraw $3, %%mm0 \n\t"\
+ "psraw $3, %%mm1 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm5, q) \
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+m"(h)\
+ : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
+ : "memory"\
+ );\
+}\
+\
+static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
+}\
+\
+static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \
+}\
+\
+static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
+}\
+\
+static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+
+#define CAVS_MC(OPNAME, SIZE, MMX) \
+static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
+}\
+
+#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
+#define AVG_3DNOW_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgusb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+#define AVG_MMX2_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+
+QPEL_CAVS(put_, PUT_OP, 3dnow)
+QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
+QPEL_CAVS(put_, PUT_OP, mmx2)
+QPEL_CAVS(avg_, AVG_MMX2_OP, mmx2)
+
+CAVS_MC(put_, 8, 3dnow)
+CAVS_MC(put_, 16,3dnow)
+CAVS_MC(avg_, 8, 3dnow)
+CAVS_MC(avg_, 16,3dnow)
+CAVS_MC(put_, 8, mmx2)
+CAVS_MC(put_, 16,mmx2)
+CAVS_MC(avg_, 8, mmx2)
+CAVS_MC(avg_, 16,mmx2)
+
+void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+
+void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx) {
+#define dspfunc(PFX, IDX, NUM) \
+ c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
+ c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmx2; \
+ c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmx2; \
+ c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmx2; \
+ c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmx2; \
+
+ dspfunc(put_cavs_qpel, 0, 16);
+ dspfunc(put_cavs_qpel, 1, 8);
+ dspfunc(avg_cavs_qpel, 0, 16);
+ dspfunc(avg_cavs_qpel, 1, 8);
+#undef dspfunc
+ c->cavs_idct8_add = cavs_idct8_add_mmx;
+}
+
+void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx) {
+#define dspfunc(PFX, IDX, NUM) \
+ c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
+ c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \
+ c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \
+ c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \
+ c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \
+
+ dspfunc(put_cavs_qpel, 0, 16);
+ dspfunc(put_cavs_qpel, 1, 8);
+ dspfunc(avg_cavs_qpel, 0, 16);
+ dspfunc(avg_cavs_qpel, 1, 8);
+#undef dspfunc
+ c->cavs_idct8_add = cavs_idct8_add_mmx;
+}
diff --git a/libavcodec/x86/cpuid.c b/libavcodec/x86/cpuid.c
new file mode 100644
index 0000000..664bac3
--- /dev/null
+++ b/libavcodec/x86/cpuid.c
@@ -0,0 +1,134 @@
+/*
+ * CPU detection code, extracted from mmx.h
+ * (c)1997-99 by H. Dietz and R. Fisher
+ * Converted to C and improved by Fabrice Bellard.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+
+#undef printf
+
+/* ebx saving is necessary for PIC. gcc seems unable to see it alone */
+#define cpuid(index,eax,ebx,ecx,edx)\
+ __asm__ volatile\
+ ("mov %%"REG_b", %%"REG_S"\n\t"\
+ "cpuid\n\t"\
+ "xchg %%"REG_b", %%"REG_S\
+ : "=a" (eax), "=S" (ebx),\
+ "=c" (ecx), "=d" (edx)\
+ : "0" (index));
+
+/* Function to test if multimedia instructions are supported... */
+int mm_support(void)
+{
+ int rval = 0;
+ int eax, ebx, ecx, edx;
+ int max_std_level, max_ext_level, std_caps=0, ext_caps=0;
+ x86_reg a, c;
+
+#ifdef ARCH_X86_64
+#define PUSHF "pushfq\n\t"
+#define POPF "popfq\n\t"
+#else
+#define PUSHF "pushfl\n\t"
+#define POPF "popfl\n\t"
+#endif
+ __asm__ volatile (
+ /* See if CPUID instruction is supported ... */
+ /* ... Get copies of EFLAGS into eax and ecx */
+ PUSHF
+ "pop %0\n\t"
+ "mov %0, %1\n\t"
+
+ /* ... Toggle the ID bit in one copy and store */
+ /* to the EFLAGS reg */
+ "xor $0x200000, %0\n\t"
+ "push %0\n\t"
+ POPF
+
+ /* ... Get the (hopefully modified) EFLAGS */
+ PUSHF
+ "pop %0\n\t"
+ : "=a" (a), "=c" (c)
+ :
+ : "cc"
+ );
+
+ if (a == c)
+ return 0; /* CPUID not supported */
+
+ cpuid(0, max_std_level, ebx, ecx, edx);
+
+ if(max_std_level >= 1){
+ cpuid(1, eax, ebx, ecx, std_caps);
+ if (std_caps & (1<<23))
+ rval |= FF_MM_MMX;
+ if (std_caps & (1<<25))
+ rval |= FF_MM_MMXEXT
+#ifdef HAVE_SSE
+ | FF_MM_SSE;
+ if (std_caps & (1<<26))
+ rval |= FF_MM_SSE2;
+ if (ecx & 1)
+ rval |= FF_MM_SSE3;
+ if (ecx & 0x00000200 )
+ rval |= FF_MM_SSSE3
+#endif
+ ;
+ }
+
+ cpuid(0x80000000, max_ext_level, ebx, ecx, edx);
+
+ if(max_ext_level >= 0x80000001){
+ cpuid(0x80000001, eax, ebx, ecx, ext_caps);
+ if (ext_caps & (1<<31))
+ rval |= FF_MM_3DNOW;
+ if (ext_caps & (1<<30))
+ rval |= FF_MM_3DNOWEXT;
+ if (ext_caps & (1<<23))
+ rval |= FF_MM_MMX;
+ if (ext_caps & (1<<22))
+ rval |= FF_MM_MMXEXT;
+ }
+
+#if 0
+ av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s\n",
+ (rval&FF_MM_MMX) ? "MMX ":"",
+ (rval&FF_MM_MMXEXT) ? "MMX2 ":"",
+ (rval&FF_MM_SSE) ? "SSE ":"",
+ (rval&FF_MM_SSE2) ? "SSE2 ":"",
+ (rval&FF_MM_SSE3) ? "SSE3 ":"",
+ (rval&FF_MM_SSSE3) ? "SSSE3 ":"",
+ (rval&FF_MM_3DNOW) ? "3DNow ":"",
+ (rval&FF_MM_3DNOWEXT) ? "3DNowExt ":"");
+#endif
+ return rval;
+}
+
+#ifdef TEST
+int main ( void )
+{
+ int mm_flags;
+ mm_flags = mm_support();
+ printf("mm_support = 0x%08X\n",mm_flags);
+ return 0;
+}
+#endif
diff --git a/libavcodec/x86/dnxhd_mmx.c b/libavcodec/x86/dnxhd_mmx.c
new file mode 100644
index 0000000..59bcb39
--- /dev/null
+++ b/libavcodec/x86/dnxhd_mmx.c
@@ -0,0 +1,58 @@
+/*
+ * VC3/DNxHD SIMD functions
+ * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
+ *
+ * VC-3 encoder funded by the British Broadcasting Corporation
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dnxhdenc.h"
+
+static void get_pixels_8x4_sym_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
+{
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7 \n\t"
+ "movq (%0), %%xmm0 \n\t"
+ "add %2, %0 \n\t"
+ "movq (%0), %%xmm1 \n\t"
+ "movq (%0, %2), %%xmm2 \n\t"
+ "movq (%0, %2,2), %%xmm3 \n\t"
+ "punpcklbw %%xmm7, %%xmm0 \n\t"
+ "punpcklbw %%xmm7, %%xmm1 \n\t"
+ "punpcklbw %%xmm7, %%xmm2 \n\t"
+ "punpcklbw %%xmm7, %%xmm3 \n\t"
+ "movdqa %%xmm0, (%1) \n\t"
+ "movdqa %%xmm1, 16(%1) \n\t"
+ "movdqa %%xmm2, 32(%1) \n\t"
+ "movdqa %%xmm3, 48(%1) \n\t"
+ "movdqa %%xmm3 , 64(%1) \n\t"
+ "movdqa %%xmm2 , 80(%1) \n\t"
+ "movdqa %%xmm1 , 96(%1) \n\t"
+ "movdqa %%xmm0, 112(%1) \n\t"
+ : "+r" (pixels)
+ : "r" (block), "r" ((x86_reg)line_size)
+ );
+}
+
+void ff_dnxhd_init_mmx(DNXHDEncContext *ctx)
+{
+ if (mm_flags & FF_MM_SSE2) {
+ ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2;
+ }
+}
diff --git a/libavcodec/x86/dsputil_h264_template_mmx.c b/libavcodec/x86/dsputil_h264_template_mmx.c
new file mode 100644
index 0000000..43f4393
--- /dev/null
+++ b/libavcodec/x86/dsputil_h264_template_mmx.c
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
+ * Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * MMX optimized version of (put|avg)_h264_chroma_mc8.
+ * H264_CHROMA_MC8_TMPL must be defined to the desired function name
+ * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
+ * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
+ */
+static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
+{
+ DECLARE_ALIGNED_8(uint64_t, AA);
+ DECLARE_ALIGNED_8(uint64_t, DD);
+ int i;
+
+ if(y==0 && x==0) {
+ /* no filter needed */
+ H264_CHROMA_MC8_MV0(dst, src, stride, h);
+ return;
+ }
+
+ assert(x<8 && y<8 && x>=0 && y>=0);
+
+ if(y==0 || x==0)
+ {
+ /* 1 dimensional filter only */
+ const int dxy = x ? 1 : stride;
+
+ __asm__ volatile(
+ "movd %0, %%mm5\n\t"
+ "movq %1, %%mm4\n\t"
+ "movq %2, %%mm6\n\t" /* mm6 = rnd >> 3 */
+ "punpcklwd %%mm5, %%mm5\n\t"
+ "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
+ "pxor %%mm7, %%mm7\n\t"
+ "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */
+ :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1)));
+
+ for(i=0; i<h; i++) {
+ __asm__ volatile(
+ /* mm0 = src[0..7], mm1 = src[1..8] */
+ "movq %0, %%mm0\n\t"
+ "movq %1, %%mm2\n\t"
+ :: "m"(src[0]), "m"(src[dxy]));
+
+ __asm__ volatile(
+ /* [mm0,mm1] = A * src[0..7] */
+ /* [mm2,mm3] = B * src[1..8] */
+ "movq %%mm0, %%mm1\n\t"
+ "movq %%mm2, %%mm3\n\t"
+ "punpcklbw %%mm7, %%mm0\n\t"
+ "punpckhbw %%mm7, %%mm1\n\t"
+ "punpcklbw %%mm7, %%mm2\n\t"
+ "punpckhbw %%mm7, %%mm3\n\t"
+ "pmullw %%mm4, %%mm0\n\t"
+ "pmullw %%mm4, %%mm1\n\t"
+ "pmullw %%mm5, %%mm2\n\t"
+ "pmullw %%mm5, %%mm3\n\t"
+
+ /* dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 */
+ "paddw %%mm6, %%mm0\n\t"
+ "paddw %%mm6, %%mm1\n\t"
+ "paddw %%mm2, %%mm0\n\t"
+ "paddw %%mm3, %%mm1\n\t"
+ "psrlw $3, %%mm0\n\t"
+ "psrlw $3, %%mm1\n\t"
+ "packuswb %%mm1, %%mm0\n\t"
+ H264_CHROMA_OP(%0, %%mm0)
+ "movq %%mm0, %0\n\t"
+ : "=m" (dst[0]));
+
+ src += stride;
+ dst += stride;
+ }
+ return;
+ }
+
+ /* general case, bilinear */
+ __asm__ volatile("movd %2, %%mm4\n\t"
+ "movd %3, %%mm6\n\t"
+ "punpcklwd %%mm4, %%mm4\n\t"
+ "punpcklwd %%mm6, %%mm6\n\t"
+ "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
+ "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
+ "movq %%mm4, %%mm5\n\t"
+ "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */
+ "psllw $3, %%mm5\n\t"
+ "psllw $3, %%mm6\n\t"
+ "movq %%mm5, %%mm7\n\t"
+ "paddw %%mm6, %%mm7\n\t"
+ "movq %%mm4, %1\n\t" /* DD = x * y */
+ "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */
+ "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */
+ "paddw %4, %%mm4\n\t"
+ "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */
+ "pxor %%mm7, %%mm7\n\t"
+ "movq %%mm4, %0\n\t"
+ : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
+
+ __asm__ volatile(
+ /* mm0 = src[0..7], mm1 = src[1..8] */
+ "movq %0, %%mm0\n\t"
+ "movq %1, %%mm1\n\t"
+ : : "m" (src[0]), "m" (src[1]));
+
+ for(i=0; i<h; i++) {
+ src += stride;
+
+ __asm__ volatile(
+ /* mm2 = A * src[0..3] + B * src[1..4] */
+ /* mm3 = A * src[4..7] + B * src[5..8] */
+ "movq %%mm0, %%mm2\n\t"
+ "movq %%mm1, %%mm3\n\t"
+ "punpckhbw %%mm7, %%mm0\n\t"
+ "punpcklbw %%mm7, %%mm1\n\t"
+ "punpcklbw %%mm7, %%mm2\n\t"
+ "punpckhbw %%mm7, %%mm3\n\t"
+ "pmullw %0, %%mm0\n\t"
+ "pmullw %0, %%mm2\n\t"
+ "pmullw %%mm5, %%mm1\n\t"
+ "pmullw %%mm5, %%mm3\n\t"
+ "paddw %%mm1, %%mm2\n\t"
+ "paddw %%mm0, %%mm3\n\t"
+ : : "m" (AA));
+
+ __asm__ volatile(
+ /* [mm2,mm3] += C * src[0..7] */
+ "movq %0, %%mm0\n\t"
+ "movq %%mm0, %%mm1\n\t"
+ "punpcklbw %%mm7, %%mm0\n\t"
+ "punpckhbw %%mm7, %%mm1\n\t"
+ "pmullw %%mm6, %%mm0\n\t"
+ "pmullw %%mm6, %%mm1\n\t"
+ "paddw %%mm0, %%mm2\n\t"
+ "paddw %%mm1, %%mm3\n\t"
+ : : "m" (src[0]));
+
+ __asm__ volatile(
+ /* [mm2,mm3] += D * src[1..8] */
+ "movq %1, %%mm1\n\t"
+ "movq %%mm1, %%mm0\n\t"
+ "movq %%mm1, %%mm4\n\t"
+ "punpcklbw %%mm7, %%mm0\n\t"
+ "punpckhbw %%mm7, %%mm4\n\t"
+ "pmullw %2, %%mm0\n\t"
+ "pmullw %2, %%mm4\n\t"
+ "paddw %%mm0, %%mm2\n\t"
+ "paddw %%mm4, %%mm3\n\t"
+ "movq %0, %%mm0\n\t"
+ : : "m" (src[0]), "m" (src[1]), "m" (DD));
+
+ __asm__ volatile(
+ /* dst[0..7] = ([mm2,mm3] + rnd) >> 6 */
+ "paddw %1, %%mm2\n\t"
+ "paddw %1, %%mm3\n\t"
+ "psrlw $6, %%mm2\n\t"
+ "psrlw $6, %%mm3\n\t"
+ "packuswb %%mm3, %%mm2\n\t"
+ H264_CHROMA_OP(%0, %%mm2)
+ "movq %%mm2, %0\n\t"
+ : "=m" (dst[0]) : "m" (*rnd_reg));
+ dst+= stride;
+ }
+}
+
+static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
+{
+ __asm__ volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "movd %5, %%mm2 \n\t"
+ "movd %6, %%mm3 \n\t"
+ "movq "MANGLE(ff_pw_8)", %%mm4\n\t"
+ "movq "MANGLE(ff_pw_8)", %%mm5\n\t"
+ "punpcklwd %%mm2, %%mm2 \n\t"
+ "punpcklwd %%mm3, %%mm3 \n\t"
+ "punpcklwd %%mm2, %%mm2 \n\t"
+ "punpcklwd %%mm3, %%mm3 \n\t"
+ "psubw %%mm2, %%mm4 \n\t"
+ "psubw %%mm3, %%mm5 \n\t"
+
+ "movd (%1), %%mm0 \n\t"
+ "movd 1(%1), %%mm6 \n\t"
+ "add %3, %1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm6 \n\t"
+ "pmullw %%mm4, %%mm0 \n\t"
+ "pmullw %%mm2, %%mm6 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+
+ "1: \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd 1(%1), %%mm1 \n\t"
+ "add %3, %1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "pmullw %%mm4, %%mm0 \n\t"
+ "pmullw %%mm2, %%mm1 \n\t"
+ "paddw %%mm0, %%mm1 \n\t"
+ "movq %%mm1, %%mm0 \n\t"
+ "pmullw %%mm5, %%mm6 \n\t"
+ "pmullw %%mm3, %%mm1 \n\t"
+ "paddw %4, %%mm6 \n\t"
+ "paddw %%mm6, %%mm1 \n\t"
+ "psrlw $6, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ H264_CHROMA_OP4((%0), %%mm1, %%mm6)
+ "movd %%mm1, (%0) \n\t"
+ "add %3, %0 \n\t"
+ "movd (%1), %%mm6 \n\t"
+ "movd 1(%1), %%mm1 \n\t"
+ "add %3, %1 \n\t"
+ "punpcklbw %%mm7, %%mm6 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "pmullw %%mm4, %%mm6 \n\t"
+ "pmullw %%mm2, %%mm1 \n\t"
+ "paddw %%mm6, %%mm1 \n\t"
+ "movq %%mm1, %%mm6 \n\t"
+ "pmullw %%mm5, %%mm0 \n\t"
+ "pmullw %%mm3, %%mm1 \n\t"
+ "paddw %4, %%mm0 \n\t"
+ "paddw %%mm0, %%mm1 \n\t"
+ "psrlw $6, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ H264_CHROMA_OP4((%0), %%mm1, %%mm0)
+ "movd %%mm1, (%0) \n\t"
+ "add %3, %0 \n\t"
+ "sub $2, %2 \n\t"
+ "jnz 1b \n\t"
+ : "+r"(dst), "+r"(src), "+r"(h)
+ : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y)
+ );
+}
+
+#ifdef H264_CHROMA_MC2_TMPL
+static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ int tmp = ((1<<16)-1)*x + 8;
+ int CD= tmp*y;
+ int AB= (tmp<<3) - CD;
+ __asm__ volatile(
+ /* mm5 = {A,B,A,B} */
+ /* mm6 = {C,D,C,D} */
+ "movd %0, %%mm5\n\t"
+ "movd %1, %%mm6\n\t"
+ "punpckldq %%mm5, %%mm5\n\t"
+ "punpckldq %%mm6, %%mm6\n\t"
+ "pxor %%mm7, %%mm7\n\t"
+ /* mm0 = src[0,1,1,2] */
+ "movd %2, %%mm2\n\t"
+ "punpcklbw %%mm7, %%mm2\n\t"
+ "pshufw $0x94, %%mm2, %%mm2\n\t"
+ :: "r"(AB), "r"(CD), "m"(src[0]));
+
+
+ __asm__ volatile(
+ "1:\n\t"
+ "add %4, %1\n\t"
+ /* mm1 = A * src[0,1] + B * src[1,2] */
+ "movq %%mm2, %%mm1\n\t"
+ "pmaddwd %%mm5, %%mm1\n\t"
+ /* mm0 = src[0,1,1,2] */
+ "movd (%1), %%mm0\n\t"
+ "punpcklbw %%mm7, %%mm0\n\t"
+ "pshufw $0x94, %%mm0, %%mm0\n\t"
+ /* mm1 += C * src[0,1] + D * src[1,2] */
+ "movq %%mm0, %%mm2\n\t"
+ "pmaddwd %%mm6, %%mm0\n\t"
+ "paddw %3, %%mm1\n\t"
+ "paddw %%mm0, %%mm1\n\t"
+ /* dst[0,1] = pack((mm1 + 32) >> 6) */
+ "psrlw $6, %%mm1\n\t"
+ "packssdw %%mm7, %%mm1\n\t"
+ "packuswb %%mm7, %%mm1\n\t"
+ H264_CHROMA_OP4((%0), %%mm1, %%mm3)
+ "movd %%mm1, %%esi\n\t"
+ "movw %%si, (%0)\n\t"
+ "add %4, %0\n\t"
+ "sub $1, %2\n\t"
+ "jnz 1b\n\t"
+ : "+r" (dst), "+r"(src), "+r"(h)
+ : "m" (ff_pw_32), "r"((x86_reg)stride)
+ : "%esi");
+
+}
+#endif
+
diff --git a/libavcodec/i386/dsputil_h264_template_ssse3.c b/libavcodec/x86/dsputil_h264_template_ssse3.c
similarity index 100%
rename from libavcodec/i386/dsputil_h264_template_ssse3.c
rename to libavcodec/x86/dsputil_h264_template_ssse3.c
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
new file mode 100644
index 0000000..1d93351
--- /dev/null
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -0,0 +1,2993 @@
+/*
+ * MMX optimized DSP utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni at gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k at mail.ru>
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/h263.h"
+#include "libavcodec/mpegvideo.h"
+#include "libavcodec/simple_idct.h"
+#include "dsputil_mmx.h"
+#include "mmx.h"
+#include "vp3dsp_mmx.h"
+#include "vp3dsp_sse2.h"
+#include "idct_xvid.h"
+
+//#undef NDEBUG
+//#include <assert.h>
+
+int mm_flags; /* multimedia extension flags */
+
+/* pixel operations */
+DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
+
+DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
+{0x8000000080000000ULL, 0x8000000080000000ULL};
+
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
+DECLARE_ALIGNED_16(const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
+DECLARE_ALIGNED_16(const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
+DECLARE_ALIGNED_16(const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
+DECLARE_ALIGNED_16(const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
+DECLARE_ALIGNED_16(const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
+
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
+
+DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
+DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
+
+#define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::)
+#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
+
+#define MOVQ_BFE(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
+ "paddb %%" #regd ", %%" #regd " \n\t" ::)
+
+#ifndef PIC
+#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
+#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
+#else
+// for shared library it's better to use this way for accessing constants
+// pcmpeqd -> -1
+#define MOVQ_BONE(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+ "psrlw $15, %%" #regd " \n\t" \
+ "packuswb %%" #regd ", %%" #regd " \n\t" ::)
+
+#define MOVQ_WTWO(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+ "psrlw $15, %%" #regd " \n\t" \
+ "psllw $1, %%" #regd " \n\t"::)
+
+#endif
+
+// using regr as temporary and for the output result
+// first argument is unmodifed and second is trashed
+// regfe is supposed to contain 0xfefefefefefefefe
+#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
+ "movq " #rega ", " #regr " \n\t"\
+ "pand " #regb ", " #regr " \n\t"\
+ "pxor " #rega ", " #regb " \n\t"\
+ "pand " #regfe "," #regb " \n\t"\
+ "psrlq $1, " #regb " \n\t"\
+ "paddb " #regb ", " #regr " \n\t"
+
+#define PAVGB_MMX(rega, regb, regr, regfe) \
+ "movq " #rega ", " #regr " \n\t"\
+ "por " #regb ", " #regr " \n\t"\
+ "pxor " #rega ", " #regb " \n\t"\
+ "pand " #regfe "," #regb " \n\t"\
+ "psrlq $1, " #regb " \n\t"\
+ "psubb " #regb ", " #regr " \n\t"
+
+// mm6 is supposed to contain 0xfefefefefefefefe
+#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
+ "movq " #rega ", " #regr " \n\t"\
+ "movq " #regc ", " #regp " \n\t"\
+ "pand " #regb ", " #regr " \n\t"\
+ "pand " #regd ", " #regp " \n\t"\
+ "pxor " #rega ", " #regb " \n\t"\
+ "pxor " #regc ", " #regd " \n\t"\
+ "pand %%mm6, " #regb " \n\t"\
+ "pand %%mm6, " #regd " \n\t"\
+ "psrlq $1, " #regb " \n\t"\
+ "psrlq $1, " #regd " \n\t"\
+ "paddb " #regb ", " #regr " \n\t"\
+ "paddb " #regd ", " #regp " \n\t"
+
+#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
+ "movq " #rega ", " #regr " \n\t"\
+ "movq " #regc ", " #regp " \n\t"\
+ "por " #regb ", " #regr " \n\t"\
+ "por " #regd ", " #regp " \n\t"\
+ "pxor " #rega ", " #regb " \n\t"\
+ "pxor " #regc ", " #regd " \n\t"\
+ "pand %%mm6, " #regb " \n\t"\
+ "pand %%mm6, " #regd " \n\t"\
+ "psrlq $1, " #regd " \n\t"\
+ "psrlq $1, " #regb " \n\t"\
+ "psubb " #regb ", " #regr " \n\t"\
+ "psubb " #regd ", " #regp " \n\t"
+
+/***********************************/
+/* MMX no rounding */
+#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
+#define SET_RND MOVQ_WONE
+#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
+#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
+
+#include "dsputil_mmx_rnd_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef PAVGBP
+#undef PAVGB
+/***********************************/
+/* MMX rounding */
+
+#define DEF(x, y) x ## _ ## y ##_mmx
+#define SET_RND MOVQ_WTWO
+#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
+#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
+
+#include "dsputil_mmx_rnd_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef PAVGBP
+#undef PAVGB
+
+/***********************************/
+/* 3Dnow specific */
+
+#define DEF(x) x ## _3dnow
+#define PAVGB "pavgusb"
+
+#include "dsputil_mmx_avg_template.c"
+
+#undef DEF
+#undef PAVGB
+
+/***********************************/
+/* MMX2 specific */
+
+#define DEF(x) x ## _mmx2
+
+/* Introduced only in MMX2 set */
+#define PAVGB "pavgb"
+
+#include "dsputil_mmx_avg_template.c"
+
+#undef DEF
+#undef PAVGB
+
+#define put_no_rnd_pixels16_mmx put_pixels16_mmx
+#define put_no_rnd_pixels8_mmx put_pixels8_mmx
+#define put_pixels16_mmx2 put_pixels16_mmx
+#define put_pixels8_mmx2 put_pixels8_mmx
+#define put_pixels4_mmx2 put_pixels4_mmx
+#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
+#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
+#define put_pixels16_3dnow put_pixels16_mmx
+#define put_pixels8_3dnow put_pixels8_mmx
+#define put_pixels4_3dnow put_pixels4_mmx
+#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
+#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
+
+/***********************************/
+/* standard MMX */
+
+void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+{
+ const DCTELEM *p;
+ uint8_t *pix;
+
+ /* read the pixels */
+ p = block;
+ pix = pixels;
+ /* unrolled loop */
+ __asm__ volatile(
+ "movq %3, %%mm0 \n\t"
+ "movq 8%3, %%mm1 \n\t"
+ "movq 16%3, %%mm2 \n\t"
+ "movq 24%3, %%mm3 \n\t"
+ "movq 32%3, %%mm4 \n\t"
+ "movq 40%3, %%mm5 \n\t"
+ "movq 48%3, %%mm6 \n\t"
+ "movq 56%3, %%mm7 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "packuswb %%mm7, %%mm6 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm2, (%0, %1) \n\t"
+ "movq %%mm4, (%0, %1, 2) \n\t"
+ "movq %%mm6, (%0, %2) \n\t"
+ ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
+ :"memory");
+ pix += line_size*4;
+ p += 32;
+
+ // if here would be an exact copy of the code above
+ // compiler would generate some very strange code
+ // thus using "r"
+ __asm__ volatile(
+ "movq (%3), %%mm0 \n\t"
+ "movq 8(%3), %%mm1 \n\t"
+ "movq 16(%3), %%mm2 \n\t"
+ "movq 24(%3), %%mm3 \n\t"
+ "movq 32(%3), %%mm4 \n\t"
+ "movq 40(%3), %%mm5 \n\t"
+ "movq 48(%3), %%mm6 \n\t"
+ "movq 56(%3), %%mm7 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "packuswb %%mm7, %%mm6 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm2, (%0, %1) \n\t"
+ "movq %%mm4, (%0, %1, 2) \n\t"
+ "movq %%mm6, (%0, %2) \n\t"
+ ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
+ :"memory");
+}
+
+static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
+ { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+
+void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+{
+ int i;
+
+ movq_m2r(*vector128, mm1);
+ for (i = 0; i < 8; i++) {
+ movq_m2r(*(block), mm0);
+ packsswb_m2r(*(block + 4), mm0);
+ block += 8;
+ paddb_r2r(mm1, mm0);
+ movq_r2m(mm0, *pixels);
+ pixels += line_size;
+ }
+}
+
+/* IDCT "add" path: add an 8x8 block of signed 16-bit coefficients to the
+ * existing 8x8 pixel area at 'pixels' (stride line_size), clamping the
+ * result to [0,255] via packuswb. Two rows are processed per iteration
+ * (four iterations total); mm7 must hold zero (set via MOVQ_ZERO) for
+ * the punpck*bw byte-to-word expansion of the current pixels. */
+void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+{
+ const DCTELEM *p;
+ uint8_t *pix;
+ int i;
+
+ /* read the pixels */
+ p = block;
+ pix = pixels;
+ MOVQ_ZERO(mm7);
+ i = 4;
+ do {
+ __asm__ volatile(
+ "movq (%2), %%mm0 \n\t"
+ "movq 8(%2), %%mm1 \n\t"
+ "movq 16(%2), %%mm2 \n\t"
+ "movq 24(%2), %%mm3 \n\t"
+ "movq %0, %%mm4 \n\t"
+ "movq %1, %%mm6 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddsw %%mm4, %%mm0 \n\t"
+ "paddsw %%mm5, %%mm1 \n\t"
+ "movq %%mm6, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm6 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddsw %%mm6, %%mm2 \n\t"
+ "paddsw %%mm5, %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %0 \n\t"
+ "movq %%mm2, %1 \n\t"
+ :"+m"(*pix), "+m"(*(pix+line_size))
+ :"r"(p)
+ :"memory");
+ pix += line_size*2;
+ p += 16;
+ } while (--i);
+}
+
+/* Copy h rows of 4 pixels from 'pixels' to 'block', both strided by
+ * line_size. Two row pairs are copied per loop iteration and the counter
+ * is decremented by 4, so h must be a multiple of 4. */
+static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2*line_size */
+ ASMALIGN(3)
+ "1: \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd (%1, %3), %%mm1 \n\t"
+ "movd %%mm0, (%2) \n\t"
+ "movd %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd (%1, %3), %%mm1 \n\t"
+ "movd %%mm0, (%2) \n\t"
+ "movd %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ : "+g"(h), "+r" (pixels), "+r" (block)
+ : "r"((x86_reg)line_size)
+ : "%"REG_a, "memory"
+ );
+}
+
+/* Copy h rows of 8 pixels from 'pixels' to 'block', both strided by
+ * line_size. Same unrolling as put_pixels4_mmx (4 rows per iteration),
+ * so h must be a multiple of 4. */
+static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2*line_size */
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ : "+g"(h), "+r" (pixels), "+r" (block)
+ : "r"((x86_reg)line_size)
+ : "%"REG_a, "memory"
+ );
+}
+
+/* Copy h rows of 16 pixels from 'pixels' to 'block' (stride line_size)
+ * using two movq per row. 4 rows per iteration; h must be a multiple
+ * of 4. */
+static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2*line_size */
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm4 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm5 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm4 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm5 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ : "+g"(h), "+r" (pixels), "+r" (block)
+ : "r"((x86_reg)line_size)
+ : "%"REG_a, "memory"
+ );
+}
+
+/* SSE2 copy of h rows of 16 pixels, 4 rows per iteration (h must be a
+ * multiple of 4). Loads are unaligned (movdqu); stores use movdqa, so
+ * 'block' must be 16-byte aligned. %4 caches 3*line_size for the
+ * fourth-row addressing. */
+static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "1: \n\t"
+ "movdqu (%1), %%xmm0 \n\t"
+ "movdqu (%1,%3), %%xmm1 \n\t"
+ "movdqu (%1,%3,2), %%xmm2 \n\t"
+ "movdqu (%1,%4), %%xmm3 \n\t"
+ "movdqa %%xmm0, (%2) \n\t"
+ "movdqa %%xmm1, (%2,%3) \n\t"
+ "movdqa %%xmm2, (%2,%3,2) \n\t"
+ "movdqa %%xmm3, (%2,%4) \n\t"
+ "subl $4, %0 \n\t"
+ "lea (%1,%3,4), %1 \n\t"
+ "lea (%2,%3,4), %2 \n\t"
+ "jnz 1b \n\t"
+ : "+g"(h), "+r" (pixels), "+r" (block)
+ : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
+ : "memory"
+ );
+}
+
+/* SSE2 averaging copy: block[i] = avg(block[i], pixels[i]) for h rows of
+ * 16 pixels, using pavgb (per-byte average with rounding). 4 rows per
+ * iteration; h must be a multiple of 4. Unaligned loads from 'pixels',
+ * aligned (movdqa) accesses to 'block'. */
+static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "1: \n\t"
+ "movdqu (%1), %%xmm0 \n\t"
+ "movdqu (%1,%3), %%xmm1 \n\t"
+ "movdqu (%1,%3,2), %%xmm2 \n\t"
+ "movdqu (%1,%4), %%xmm3 \n\t"
+ "pavgb (%2), %%xmm0 \n\t"
+ "pavgb (%2,%3), %%xmm1 \n\t"
+ "pavgb (%2,%3,2), %%xmm2 \n\t"
+ "pavgb (%2,%4), %%xmm3 \n\t"
+ "movdqa %%xmm0, (%2) \n\t"
+ "movdqa %%xmm1, (%2,%3) \n\t"
+ "movdqa %%xmm2, (%2,%3,2) \n\t"
+ "movdqa %%xmm3, (%2,%4) \n\t"
+ "subl $4, %0 \n\t"
+ "lea (%1,%3,4), %1 \n\t"
+ "lea (%2,%3,4), %2 \n\t"
+ "jnz 1b \n\t"
+ : "+g"(h), "+r" (pixels), "+r" (block)
+ : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
+ : "memory"
+ );
+}
+
+/* Define a function zeroing n consecutive 64-coefficient (128-byte) DCT
+ * blocks with MMX stores, 32 bytes per iteration. The pointer is biased
+ * to the end and indexed with a negative offset counting up to zero, so
+ * the loop condition is a simple sign test (js). */
+#define CLEAR_BLOCKS(name,n) \
+static void name(DCTELEM *blocks)\
+{\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "mov %1, %%"REG_a" \n\t"\
+ "1: \n\t"\
+ "movq %%mm7, (%0, %%"REG_a") \n\t"\
+ "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
+ "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
+ "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
+ "add $32, %%"REG_a" \n\t"\
+ " js 1b \n\t"\
+ : : "r" (((uint8_t *)blocks)+128*n),\
+ "i" (-128*n)\
+ : "%"REG_a\
+ );\
+}
+/* Clear all 6 blocks of a macroblock / a single block, respectively. */
+CLEAR_BLOCKS(clear_blocks_mmx, 6)
+CLEAR_BLOCKS(clear_block_mmx, 1)
+
+/* Zero one 64-coefficient (128-byte) DCT block with eight aligned SSE
+ * stores; 'block' must be 16-byte aligned (movaps). */
+static void clear_block_sse(DCTELEM *block)
+{
+ __asm__ volatile(
+ "xorps %%xmm0, %%xmm0 \n"
+ "movaps %%xmm0, (%0) \n"
+ "movaps %%xmm0, 16(%0) \n"
+ "movaps %%xmm0, 32(%0) \n"
+ "movaps %%xmm0, 48(%0) \n"
+ "movaps %%xmm0, 64(%0) \n"
+ "movaps %%xmm0, 80(%0) \n"
+ "movaps %%xmm0, 96(%0) \n"
+ "movaps %%xmm0, 112(%0) \n"
+ :: "r"(block)
+ : "memory"
+ );
+}
+
+/* dst[i] += src[i] for i in [0, w): 16 bytes per MMX iteration while
+ * i < w-15, then a scalar loop for the remaining 0..15 tail bytes.
+ * paddb wraps modulo 256, matching the scalar uint8_t addition.
+ * NOTE(review): the asm writes dst through a register operand but
+ * declares no "memory" clobber — presumably tolerated because dst is
+ * re-read only by the scalar tail; confirm against upstream. */
+static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
+ x86_reg i=0;
+ __asm__ volatile(
+ "jmp 2f \n\t"
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq (%2, %0), %%mm1 \n\t"
+ "paddb %%mm0, %%mm1 \n\t"
+ "movq %%mm1, (%2, %0) \n\t"
+ "movq 8(%1, %0), %%mm0 \n\t"
+ "movq 8(%2, %0), %%mm1 \n\t"
+ "paddb %%mm0, %%mm1 \n\t"
+ "movq %%mm1, 8(%2, %0) \n\t"
+ "add $16, %0 \n\t"
+ "2: \n\t"
+ "cmp %3, %0 \n\t"
+ " js 1b \n\t"
+ : "+r" (i)
+ : "r"(src), "r"(dst), "r"((x86_reg)w-15)
+ );
+ for(; i<w; i++)
+ dst[i+0] += src[i+0];
+}
+
+/* dst[i] = src1[i] + src2[i] (wrapping byte add) for i in [0, w):
+ * 16 bytes per MMX iteration while i < w-15, scalar loop for the tail.
+ * NOTE(review): like add_bytes_mmx, no "memory" clobber is declared for
+ * the stores through %1 — verify against upstream before touching. */
+static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
+ x86_reg i=0;
+ __asm__ volatile(
+ "jmp 2f \n\t"
+ "1: \n\t"
+ "movq (%2, %0), %%mm0 \n\t"
+ "movq 8(%2, %0), %%mm1 \n\t"
+ "paddb (%3, %0), %%mm0 \n\t"
+ "paddb 8(%3, %0), %%mm1 \n\t"
+ "movq %%mm0, (%1, %0) \n\t"
+ "movq %%mm1, 8(%1, %0) \n\t"
+ "add $16, %0 \n\t"
+ "2: \n\t"
+ "cmp %4, %0 \n\t"
+ " js 1b \n\t"
+ : "+r" (i)
+ : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
+ );
+ for(; i<w; i++)
+ dst[i] = src1[i] + src2[i];
+}
+
+#define H263_LOOP_FILTER \
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %0, %%mm0 \n\t"\
+ "movq %0, %%mm1 \n\t"\
+ "movq %3, %%mm2 \n\t"\
+ "movq %3, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "psubw %%mm2, %%mm0 \n\t"\
+ "psubw %%mm3, %%mm1 \n\t"\
+ "movq %1, %%mm2 \n\t"\
+ "movq %1, %%mm3 \n\t"\
+ "movq %2, %%mm4 \n\t"\
+ "movq %2, %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "punpckhbw %%mm7, %%mm5 \n\t"\
+ "psubw %%mm2, %%mm4 \n\t"\
+ "psubw %%mm3, %%mm5 \n\t"\
+ "psllw $2, %%mm4 \n\t"\
+ "psllw $2, %%mm5 \n\t"\
+ "paddw %%mm0, %%mm4 \n\t"\
+ "paddw %%mm1, %%mm5 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ "pcmpgtw %%mm4, %%mm6 \n\t"\
+ "pcmpgtw %%mm5, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm4 \n\t"\
+ "pxor %%mm7, %%mm5 \n\t"\
+ "psubw %%mm6, %%mm4 \n\t"\
+ "psubw %%mm7, %%mm5 \n\t"\
+ "psrlw $3, %%mm4 \n\t"\
+ "psrlw $3, %%mm5 \n\t"\
+ "packuswb %%mm5, %%mm4 \n\t"\
+ "packsswb %%mm7, %%mm6 \n\t"\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd %4, %%mm2 \n\t"\
+ "punpcklbw %%mm2, %%mm2 \n\t"\
+ "punpcklbw %%mm2, %%mm2 \n\t"\
+ "punpcklbw %%mm2, %%mm2 \n\t"\
+ "psubusb %%mm4, %%mm2 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "psubusb %%mm4, %%mm3 \n\t"\
+ "psubb %%mm3, %%mm2 \n\t"\
+ "movq %1, %%mm3 \n\t"\
+ "movq %2, %%mm4 \n\t"\
+ "pxor %%mm6, %%mm3 \n\t"\
+ "pxor %%mm6, %%mm4 \n\t"\
+ "paddusb %%mm2, %%mm3 \n\t"\
+ "psubusb %%mm2, %%mm4 \n\t"\
+ "pxor %%mm6, %%mm3 \n\t"\
+ "pxor %%mm6, %%mm4 \n\t"\
+ "paddusb %%mm2, %%mm2 \n\t"\
+ "packsswb %%mm1, %%mm0 \n\t"\
+ "pcmpgtb %%mm0, %%mm7 \n\t"\
+ "pxor %%mm7, %%mm0 \n\t"\
+ "psubb %%mm7, %%mm0 \n\t"\
+ "movq %%mm0, %%mm1 \n\t"\
+ "psubusb %%mm2, %%mm0 \n\t"\
+ "psubb %%mm0, %%mm1 \n\t"\
+ "pand %5, %%mm1 \n\t"\
+ "psrlw $2, %%mm1 \n\t"\
+ "pxor %%mm7, %%mm1 \n\t"\
+ "psubb %%mm7, %%mm1 \n\t"\
+ "movq %0, %%mm5 \n\t"\
+ "movq %3, %%mm6 \n\t"\
+ "psubb %%mm1, %%mm5 \n\t"\
+ "paddb %%mm1, %%mm6 \n\t"
+
+/* H.263 vertical loop filter: filters the horizontal block edge at
+ * 'src', operating on the four 8-byte rows src-2*stride .. src+1*stride.
+ * Runs the shared H263_LOOP_FILTER core with 2*strength (strength looked
+ * up from qscale) and writes the filtered rows back in place. Compiled
+ * out when no H.263-family codec is enabled. */
+static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
+ if(ENABLE_ANY_H263) {
+ const int strength= ff_h263_loop_filter_strength[qscale];
+
+ __asm__ volatile(
+
+ H263_LOOP_FILTER
+
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %0 \n\t"
+ "movq %%mm6, %3 \n\t"
+ : "+m" (*(uint64_t*)(src - 2*stride)),
+ "+m" (*(uint64_t*)(src - 1*stride)),
+ "+m" (*(uint64_t*)(src + 0*stride)),
+ "+m" (*(uint64_t*)(src + 1*stride))
+ : "g" (2*strength), "m"(ff_pb_FC)
+ );
+ }
+}
+
+/* Transpose a 4x4 block of bytes: reads 4 dwords from src (stride
+ * src_stride), interleaves them with punpck, and writes the transposed
+ * 4 dwords to dst (stride dst_stride). */
+static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
+ __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
+ "movd %4, %%mm0 \n\t"
+ "movd %5, %%mm1 \n\t"
+ "movd %6, %%mm2 \n\t"
+ "movd %7, %%mm3 \n\t"
+ "punpcklbw %%mm1, %%mm0 \n\t"
+ "punpcklbw %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "punpcklwd %%mm2, %%mm0 \n\t"
+ "punpckhwd %%mm2, %%mm1 \n\t"
+ "movd %%mm0, %0 \n\t"
+ "punpckhdq %%mm0, %%mm0 \n\t"
+ "movd %%mm0, %1 \n\t"
+ "movd %%mm1, %2 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movd %%mm1, %3 \n\t"
+
+ : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
+ "=m" (*(uint32_t*)(dst + 1*dst_stride)),
+ "=m" (*(uint32_t*)(dst + 2*dst_stride)),
+ "=m" (*(uint32_t*)(dst + 3*dst_stride))
+ : "m" (*(uint32_t*)(src + 0*src_stride)),
+ "m" (*(uint32_t*)(src + 1*src_stride)),
+ "m" (*(uint32_t*)(src + 2*src_stride)),
+ "m" (*(uint32_t*)(src + 3*src_stride))
+ );
+}
+
+/* H.263 horizontal loop filter: filters the vertical block edge at
+ * 'src'. The 8x4 column window around the edge (src-2 .. src+1 for 8
+ * rows) is transposed into an aligned temp via two transpose4x4 calls,
+ * run through the shared H263_LOOP_FILTER core, then the second asm
+ * block transposes mm3..mm6 back and scatters the columns to the rows.
+ * Compiled out when no H.263-family codec is enabled. */
+static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
+ if(ENABLE_ANY_H263) {
+ const int strength= ff_h263_loop_filter_strength[qscale];
+ DECLARE_ALIGNED(8, uint64_t, temp[4]);
+ uint8_t *btemp= (uint8_t*)temp;
+
+ src -= 2; /* point at the leftmost of the 4 columns being filtered */
+
+ transpose4x4(btemp , src , 8, stride);
+ transpose4x4(btemp+4, src + 4*stride, 8, stride);
+ __asm__ volatile(
+ H263_LOOP_FILTER // 5 3 4 6
+
+ : "+m" (temp[0]),
+ "+m" (temp[1]),
+ "+m" (temp[2]),
+ "+m" (temp[3])
+ : "g" (2*strength), "m"(ff_pb_FC)
+ );
+
+ /* transpose the filter results (still live in mm3..mm6 from the
+ * previous asm block) back into column order and store them */
+ __asm__ volatile(
+ "movq %%mm5, %%mm1 \n\t"
+ "movq %%mm4, %%mm0 \n\t"
+ "punpcklbw %%mm3, %%mm5 \n\t"
+ "punpcklbw %%mm6, %%mm4 \n\t"
+ "punpckhbw %%mm3, %%mm1 \n\t"
+ "punpckhbw %%mm6, %%mm0 \n\t"
+ "movq %%mm5, %%mm3 \n\t"
+ "movq %%mm1, %%mm6 \n\t"
+ "punpcklwd %%mm4, %%mm5 \n\t"
+ "punpcklwd %%mm0, %%mm1 \n\t"
+ "punpckhwd %%mm4, %%mm3 \n\t"
+ "punpckhwd %%mm0, %%mm6 \n\t"
+ "movd %%mm5, (%0) \n\t"
+ "punpckhdq %%mm5, %%mm5 \n\t"
+ "movd %%mm5, (%0,%2) \n\t"
+ "movd %%mm3, (%0,%2,2) \n\t"
+ "punpckhdq %%mm3, %%mm3 \n\t"
+ "movd %%mm3, (%0,%3) \n\t"
+ "movd %%mm1, (%1) \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movd %%mm1, (%1,%2) \n\t"
+ "movd %%mm6, (%1,%2,2) \n\t"
+ "punpckhdq %%mm6, %%mm6 \n\t"
+ "movd %%mm6, (%1,%3) \n\t"
+ :: "r" (src),
+ "r" (src + 4*stride),
+ "r" ((x86_reg) stride ),
+ "r" ((x86_reg)(3*stride))
+ );
+ }
+}
+
+/* Replicate the border pixels of a width x height image (row stride
+ * 'wrap') outwards by w pixels on each side, so that motion compensation
+ * may read outside the picture. Left/right edges first (per-row byte
+ * splat via punpck), then top/bottom rows (plus corners) by copying
+ * whole extended rows. This MMX version can only handle w==8 || w==16. */
+static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
+{
+ uint8_t *ptr, *last_line;
+ int i;
+
+ last_line = buf + (height - 1) * wrap;
+ /* left and right */
+ ptr = buf;
+ if(w==8)
+ {
+ __asm__ volatile(
+ "1: \n\t"
+ "movd (%0), %%mm0 \n\t"
+ "punpcklbw %%mm0, %%mm0 \n\t"
+ "punpcklwd %%mm0, %%mm0 \n\t"
+ "punpckldq %%mm0, %%mm0 \n\t"
+ "movq %%mm0, -8(%0) \n\t"
+ "movq -8(%0, %2), %%mm1 \n\t"
+ "punpckhbw %%mm1, %%mm1 \n\t"
+ "punpckhwd %%mm1, %%mm1 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movq %%mm1, (%0, %2) \n\t"
+ "add %1, %0 \n\t"
+ "cmp %3, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (ptr)
+ : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
+ );
+ }
+ else
+ {
+ /* w==16: same splat, but store two quadwords per side */
+ __asm__ volatile(
+ "1: \n\t"
+ "movd (%0), %%mm0 \n\t"
+ "punpcklbw %%mm0, %%mm0 \n\t"
+ "punpcklwd %%mm0, %%mm0 \n\t"
+ "punpckldq %%mm0, %%mm0 \n\t"
+ "movq %%mm0, -8(%0) \n\t"
+ "movq %%mm0, -16(%0) \n\t"
+ "movq -8(%0, %2), %%mm1 \n\t"
+ "punpckhbw %%mm1, %%mm1 \n\t"
+ "punpckhwd %%mm1, %%mm1 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movq %%mm1, (%0, %2) \n\t"
+ "movq %%mm1, 8(%0, %2) \n\t"
+ "add %1, %0 \n\t"
+ "cmp %3, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (ptr)
+ : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
+ );
+ }
+
+ /* 4 rows above and below per iteration, copying the first/last
+ * extended row (including the already-filled left/right margins) */
+ for(i=0;i<w;i+=4) {
+ /* top and bottom (and hopefully also the corners) */
+ ptr= buf - (i + 1) * wrap - w;
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm0, (%0, %2) \n\t"
+ "movq %%mm0, (%0, %2, 2) \n\t"
+ "movq %%mm0, (%0, %3) \n\t"
+ "add $8, %0 \n\t"
+ "cmp %4, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (ptr)
+ : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
+ );
+ ptr= last_line + (i + 1) * wrap - w;
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm0, (%0, %2) \n\t"
+ "movq %%mm0, (%0, %2, 2) \n\t"
+ "movq %%mm0, (%0, %3) \n\t"
+ "add $8, %0 \n\t"
+ "cmp %4, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (ptr)
+ : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
+ );
+ }
+}
+
+#define PAETH(cpu, abs3)\
+static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
+{\
+ x86_reg i = -bpp;\
+ x86_reg end = w-3;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n"\
+ "movd (%1,%0), %%mm0 \n"\
+ "movd (%2,%0), %%mm1 \n"\
+ "punpcklbw %%mm7, %%mm0 \n"\
+ "punpcklbw %%mm7, %%mm1 \n"\
+ "add %4, %0 \n"\
+ "1: \n"\
+ "movq %%mm1, %%mm2 \n"\
+ "movd (%2,%0), %%mm1 \n"\
+ "movq %%mm2, %%mm3 \n"\
+ "punpcklbw %%mm7, %%mm1 \n"\
+ "movq %%mm2, %%mm4 \n"\
+ "psubw %%mm1, %%mm3 \n"\
+ "psubw %%mm0, %%mm4 \n"\
+ "movq %%mm3, %%mm5 \n"\
+ "paddw %%mm4, %%mm5 \n"\
+ abs3\
+ "movq %%mm4, %%mm6 \n"\
+ "pminsw %%mm5, %%mm6 \n"\
+ "pcmpgtw %%mm6, %%mm3 \n"\
+ "pcmpgtw %%mm5, %%mm4 \n"\
+ "movq %%mm4, %%mm6 \n"\
+ "pand %%mm3, %%mm4 \n"\
+ "pandn %%mm3, %%mm6 \n"\
+ "pandn %%mm0, %%mm3 \n"\
+ "movd (%3,%0), %%mm0 \n"\
+ "pand %%mm1, %%mm6 \n"\
+ "pand %%mm4, %%mm2 \n"\
+ "punpcklbw %%mm7, %%mm0 \n"\
+ "movq %6, %%mm5 \n"\
+ "paddw %%mm6, %%mm0 \n"\
+ "paddw %%mm2, %%mm3 \n"\
+ "paddw %%mm3, %%mm0 \n"\
+ "pand %%mm5, %%mm0 \n"\
+ "movq %%mm0, %%mm3 \n"\
+ "packuswb %%mm3, %%mm3 \n"\
+ "movd %%mm3, (%1,%0) \n"\
+ "add %4, %0 \n"\
+ "cmp %5, %0 \n"\
+ "jle 1b \n"\
+ :"+r"(i)\
+ :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
+ "m"(ff_pw_255)\
+ :"memory"\
+ );\
+}
+
+#define ABS3_MMX2\
+ "psubw %%mm5, %%mm7 \n"\
+ "pmaxsw %%mm7, %%mm5 \n"\
+ "pxor %%mm6, %%mm6 \n"\
+ "pxor %%mm7, %%mm7 \n"\
+ "psubw %%mm3, %%mm6 \n"\
+ "psubw %%mm4, %%mm7 \n"\
+ "pmaxsw %%mm6, %%mm3 \n"\
+ "pmaxsw %%mm7, %%mm4 \n"\
+ "pxor %%mm7, %%mm7 \n"
+
+#define ABS3_SSSE3\
+ "pabsw %%mm3, %%mm3 \n"\
+ "pabsw %%mm4, %%mm4 \n"\
+ "pabsw %%mm5, %%mm5 \n"
+
+PAETH(mmx2, ABS3_MMX2)
+#ifdef HAVE_SSSE3
+PAETH(ssse3, ABS3_SSSE3)
+#endif
+
+/* Emit one output row of the MPEG-4 qpel vertical lowpass filter:
+ * (x1*20 - x2*6 + x3*3 - x4 + rnd) >> 5, packed and stored via OP.
+ * m3..m6 carry already-loaded rows; in0/in1/in2/in7 are memory taps.
+ * Note: the pw_20 and pw_3 macro arguments are unused — the constants
+ * are referenced directly through MANGLE(ff_pw_20)/MANGLE(ff_pw_3). */
+#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
+ "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
+ "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
+ "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
+ "movq "#in7", " #m3 " \n\t" /* d */\
+ "movq "#in0", %%mm5 \n\t" /* D */\
+ "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
+ "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
+ "movq "#in1", %%mm5 \n\t" /* C */\
+ "movq "#in2", %%mm6 \n\t" /* B */\
+ "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
+ "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
+ "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
+ "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
+ "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
+ "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
+ "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
+ "psraw $5, %%mm5 \n\t"\
+ "packuswb %%mm5, %%mm5 \n\t"\
+ OP(%%mm5, out, %%mm7, d)
+
+/* Expand the MPEG-4 quarter-pel HORIZONTAL lowpass filters for a given
+ * OPNAME (put/avg) and rounding: 16- and 8-pixel-wide variants for MMX2
+ * (uses pshufw; processes one row per loop iteration, h rows total) and
+ * 3DNow fallbacks that compute the taps in scalar C into a temp buffer
+ * and only use MMX for the (x+ROUNDER)>>5 + pack + OP stage. The filter
+ * per output pixel is (a*20 - b*6 + c*3 - d + rnd) >> 5 with mirrored
+ * edge taps. Register allocation and asm ordering are exact; body kept
+ * byte-identical. */
+#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
+static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ uint64_t temp;\
+\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
+ "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
+ "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
+ "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
+ "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
+ "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
+ "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
+ "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
+ "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
+ "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
+ "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
+ "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
+ "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
+ "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
+ "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
+ "paddw %%mm3, %%mm5 \n\t" /* b */\
+ "paddw %%mm2, %%mm6 \n\t" /* c */\
+ "paddw %%mm5, %%mm5 \n\t" /* 2b */\
+ "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
+ "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
+ "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
+ "paddw %%mm4, %%mm0 \n\t" /* a */\
+ "paddw %%mm1, %%mm5 \n\t" /* d */\
+ "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
+ "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
+ "paddw %6, %%mm6 \n\t"\
+ "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
+ "psraw $5, %%mm0 \n\t"\
+ "movq %%mm0, %5 \n\t"\
+ /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
+ \
+ "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
+ "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
+ "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
+ "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
+ "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
+ "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
+ "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
+ "paddw %%mm0, %%mm2 \n\t" /* b */\
+ "paddw %%mm5, %%mm3 \n\t" /* c */\
+ "paddw %%mm2, %%mm2 \n\t" /* 2b */\
+ "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
+ "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
+ "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
+ "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
+ "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
+ "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
+ "paddw %%mm2, %%mm1 \n\t" /* a */\
+ "paddw %%mm6, %%mm4 \n\t" /* d */\
+ "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
+ "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
+ "paddw %6, %%mm1 \n\t"\
+ "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
+ "psraw $5, %%mm3 \n\t"\
+ "movq %5, %%mm1 \n\t"\
+ "packuswb %%mm3, %%mm1 \n\t"\
+ OP_MMX2(%%mm1, (%1),%%mm4, q)\
+ /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
+ \
+ "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
+ "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
+ "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
+ "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
+ "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
+ "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
+ "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
+ "paddw %%mm1, %%mm5 \n\t" /* b */\
+ "paddw %%mm4, %%mm0 \n\t" /* c */\
+ "paddw %%mm5, %%mm5 \n\t" /* 2b */\
+ "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
+ "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
+ "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
+ "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
+ "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
+ "paddw %%mm3, %%mm2 \n\t" /* d */\
+ "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
+ "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
+ "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
+ "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
+ "paddw %%mm2, %%mm6 \n\t" /* a */\
+ "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
+ "paddw %6, %%mm0 \n\t"\
+ "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
+ "psraw $5, %%mm0 \n\t"\
+ /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
+ \
+ "paddw %%mm5, %%mm3 \n\t" /* a */\
+ "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
+ "paddw %%mm4, %%mm6 \n\t" /* b */\
+ "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
+ "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
+ "paddw %%mm1, %%mm4 \n\t" /* c */\
+ "paddw %%mm2, %%mm5 \n\t" /* d */\
+ "paddw %%mm6, %%mm6 \n\t" /* 2b */\
+ "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
+ "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
+ "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
+ "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
+ "paddw %6, %%mm4 \n\t"\
+ "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
+ "psraw $5, %%mm4 \n\t"\
+ "packuswb %%mm4, %%mm0 \n\t"\
+ OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
+ \
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+D"(h)\
+ : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
+ : "memory"\
+ );\
+}\
+\
+static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ int i;\
+ int16_t temp[16];\
+ /* quick HACK, XXX FIXME MUST be optimized */\
+ for(i=0; i<h; i++)\
+ {\
+ temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
+ temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
+ temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
+ temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
+ temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
+ temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
+ temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
+ temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
+ temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
+ temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
+ temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
+ temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
+ temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
+ temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
+ temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
+ temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
+ __asm__ volatile(\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 8(%0), %%mm1 \n\t"\
+ "paddw %2, %%mm0 \n\t"\
+ "paddw %2, %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ OP_3DNOW(%%mm0, (%1), %%mm1, q)\
+ "movq 16(%0), %%mm0 \n\t"\
+ "movq 24(%0), %%mm1 \n\t"\
+ "paddw %2, %%mm0 \n\t"\
+ "paddw %2, %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
+ :: "r"(temp), "r"(dst), "m"(ROUNDER)\
+ : "memory"\
+ );\
+ dst+=dstStride;\
+ src+=srcStride;\
+ }\
+}\
+\
+static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
+ "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
+ "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
+ "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
+ "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
+ "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
+ "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
+ "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
+ "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
+ "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
+ "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
+ "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
+ "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
+ "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
+ "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
+ "paddw %%mm3, %%mm5 \n\t" /* b */\
+ "paddw %%mm2, %%mm6 \n\t" /* c */\
+ "paddw %%mm5, %%mm5 \n\t" /* 2b */\
+ "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
+ "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
+ "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
+ "paddw %%mm4, %%mm0 \n\t" /* a */\
+ "paddw %%mm1, %%mm5 \n\t" /* d */\
+ "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
+ "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
+ "paddw %5, %%mm6 \n\t"\
+ "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
+ "psraw $5, %%mm0 \n\t"\
+ /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
+ \
+ "movd 5(%0), %%mm5 \n\t" /* FGHI */\
+ "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
+ "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
+ "paddw %%mm5, %%mm1 \n\t" /* a */\
+ "paddw %%mm6, %%mm2 \n\t" /* b */\
+ "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
+ "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
+ "paddw %%mm6, %%mm3 \n\t" /* c */\
+ "paddw %%mm5, %%mm4 \n\t" /* d */\
+ "paddw %%mm2, %%mm2 \n\t" /* 2b */\
+ "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
+ "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
+ "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
+ "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
+ "paddw %5, %%mm1 \n\t"\
+ "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
+ "psraw $5, %%mm3 \n\t"\
+ "packuswb %%mm3, %%mm0 \n\t"\
+ OP_MMX2(%%mm0, (%1), %%mm4, q)\
+ \
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+d"(h)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
+ : "memory"\
+ );\
+}\
+\
+static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ int i;\
+ int16_t temp[8];\
+ /* quick HACK, XXX FIXME MUST be optimized */\
+ for(i=0; i<h; i++)\
+ {\
+ temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
+ temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
+ temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
+ temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
+ temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
+ temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
+ temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
+ temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
+ __asm__ volatile(\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 8(%0), %%mm1 \n\t"\
+ "paddw %2, %%mm0 \n\t"\
+ "paddw %2, %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ OP_3DNOW(%%mm0, (%1), %%mm1, q)\
+ :: "r"(temp), "r"(dst), "m"(ROUNDER)\
+ :"memory"\
+ );\
+ dst+=dstStride;\
+ src+=srcStride;\
+ }\
+}
+
+#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
+\
+static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ uint64_t temp[17*4];\
+ uint64_t *temp_ptr= temp;\
+ int count= 17;\
+\
+ /*FIXME unroll */\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq (%0), %%mm1 \n\t"\
+ "movq 8(%0), %%mm2 \n\t"\
+ "movq 8(%0), %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "movq %%mm0, (%1) \n\t"\
+ "movq %%mm1, 17*8(%1) \n\t"\
+ "movq %%mm2, 2*17*8(%1) \n\t"\
+ "movq %%mm3, 3*17*8(%1) \n\t"\
+ "add $8, %1 \n\t"\
+ "add %3, %0 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+r" (src), "+r" (temp_ptr), "+r"(count)\
+ : "r" ((x86_reg)srcStride)\
+ : "memory"\
+ );\
+ \
+ temp_ptr= temp;\
+ count=4;\
+ \
+/*FIXME reorder for speed */\
+ __asm__ volatile(\
+ /*"pxor %%mm7, %%mm7 \n\t"*/\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 8(%0), %%mm1 \n\t"\
+ "movq 16(%0), %%mm2 \n\t"\
+ "movq 24(%0), %%mm3 \n\t"\
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
+ \
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
+ \
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t" \
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
+ \
+ "add $136, %0 \n\t"\
+ "add %6, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ \
+ : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
+ : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
+ :"memory"\
+ );\
+}\
+\
+static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ uint64_t temp[9*2];\
+ uint64_t *temp_ptr= temp;\
+ int count= 9;\
+\
+ /*FIXME unroll */\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq (%0), %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "movq %%mm0, (%1) \n\t"\
+ "movq %%mm1, 9*8(%1) \n\t"\
+ "add $8, %1 \n\t"\
+ "add %3, %0 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+r" (src), "+r" (temp_ptr), "+r"(count)\
+ : "r" ((x86_reg)srcStride)\
+ : "memory"\
+ );\
+ \
+ temp_ptr= temp;\
+ count=2;\
+ \
+/*FIXME reorder for speed */\
+ __asm__ volatile(\
+ /*"pxor %%mm7, %%mm7 \n\t"*/\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 8(%0), %%mm1 \n\t"\
+ "movq 16(%0), %%mm2 \n\t"\
+ "movq 24(%0), %%mm3 \n\t"\
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
+ \
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
+ \
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
+ \
+ "add $72, %0 \n\t"\
+ "add %6, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ \
+ : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
+ : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
+ : "memory"\
+ );\
+}\
+\
+static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[8];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[8];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[8];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[8];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half) + 64;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half) + 64;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half) + 64;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half) + 64;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half) + 64;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half) + 64;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
+ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
+ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[9];\
+ uint8_t * const halfH= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
+}\
+static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[32];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[32];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[32];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[32];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[16*2 + 17*2];\
+ uint8_t * const halfH= ((uint8_t*)half) + 256;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[16*2 + 17*2];\
+ uint8_t * const halfH= ((uint8_t*)half) + 256;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[16*2 + 17*2];\
+ uint8_t * const halfH= ((uint8_t*)half) + 256;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[16*2 + 17*2];\
+ uint8_t * const halfH= ((uint8_t*)half) + 256;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[16*2 + 17*2];\
+ uint8_t * const halfH= ((uint8_t*)half) + 256;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[16*2 + 17*2];\
+ uint8_t * const halfH= ((uint8_t*)half) + 256;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[17*2];\
+ uint8_t * const halfH= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
+ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[17*2];\
+ uint8_t * const halfH= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
+ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[17*2];\
+ uint8_t * const halfH= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
+}
+
+/* Store macros plugged in as the OP parameter of the qpel templates.
+ * PUT_OP: plain store of result "a" to destination "b" ("size" selects the
+ * mov width, e.g. q or d; "temp" unused).
+ * AVG_*_OP: load the current destination into "temp", average it with "a"
+ * (pavgusb = 3DNow! byte average, pavgb = MMX2/SSE-integer byte average),
+ * then store back — i.e. the motion-comp "avg" flavour. */
+#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
+#define AVG_3DNOW_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgusb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+#define AVG_MMX2_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+
+/* Instantiate the qpel templates: put/avg and no-rounding variants, for the
+ * 3DNow! and MMX2 store macros above.  ff_pw_16 vs ff_pw_15 is the ROUNDER
+ * constant passed into the lowpass asm (normal vs no_rnd rounding). */
+QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
+QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
+QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
+QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
+QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
+QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
+QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
+QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
+QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
+
+/***********************************/
+/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
+
+/* Map one quarter-pel position XY to an existing half-pel routine HPEL
+ * (e.g. _x2_/_y2_/_xy2_ pixel averaging). */
+#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
+}
+/* Map position XY to the 3-point blend helper 2tap_qpel*_l3_* (defined
+ * elsewhere in this file): base offset S0, plus the two extra taps at
+ * offsets S1 and S2. */
+#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
+}
+
+/* Build the full 16-position 2-tap qpel set for one OPNAME/SIZE/MMX combo.
+ * mc00/mc21/mc12 are plain function-pointer aliases of existing routines;
+ * mc32/mc23 reuse the hpel averagers with a shifted source. */
+#define QPEL_2TAP(OPNAME, SIZE, MMX)\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
+ OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
+ OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
+ OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
+}\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
+}\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
+
+QPEL_2TAP(put_, 16, mmx2)
+QPEL_2TAP(avg_, 16, mmx2)
+QPEL_2TAP(put_, 8, mmx2)
+QPEL_2TAP(avg_, 8, mmx2)
+QPEL_2TAP(put_, 16, 3dnow)
+QPEL_2TAP(avg_, 16, 3dnow)
+QPEL_2TAP(put_, 8, 3dnow)
+QPEL_2TAP(avg_, 8, 3dnow)
+
+
+/* Dead debugging stub, deliberately compiled out. */
+#if 0
+static void just_return(void) { return; }
+#endif
+
+/* Global motion compensation of one 8-wide, h-tall block with MMX bilinear
+ * interpolation.  The fast path is taken only when the full-pel offset is
+ * constant across the whole block and the motion vectors use no more than
+ * 16 bits of sub-pel precision; anything else falls back to the C version
+ * ff_gmc_c.  Blocks that read outside the picture go through
+ * ff_emulated_edge_mc first. */
+static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
+ int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
+ const int w = 8;
+ const int ix = ox>>(16+shift);
+ const int iy = oy>>(16+shift);
+ const int oxs = ox>>4;
+ const int oys = oy>>4;
+ const int dxxs = dxx>>4;
+ const int dxys = dxy>>4;
+ const int dyxs = dyx>>4;
+ const int dyys = dyy>>4;
+ const uint16_t r4[4] = {r,r,r,r};
+ const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
+ const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
+ const uint64_t shift2 = 2*shift;
+ /* VLA scratch for edge emulation; NOTE(review): size depends on the
+  * caller-supplied h and stride — assumed small, confirm against callers. */
+ uint8_t edge_buf[(h+1)*stride];
+ int x, y;
+
+ const int dxw = (dxx-(1<<(16+shift)))*(w-1);
+ const int dyh = (dyy-(1<<(16+shift)))*(h-1);
+ const int dxh = dxy*(h-1);
+ const int dyw = dyx*(w-1);
+ if( // non-constant fullpel offset (3% of blocks)
+ ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
+ (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
+ // uses more than 16 bits of subpel mv (only at huge resolution)
+ || (dxx|dxy|dyx|dyy)&15 )
+ {
+ //FIXME could still use mmx for some of the rows
+ ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
+ return;
+ }
+
+ src += ix + iy*stride;
+ /* unsigned compare doubles as a "negative or too large" range check */
+ if( (unsigned)ix >= width-w ||
+ (unsigned)iy >= height-h )
+ {
+ ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
+ src = edge_buf;
+ }
+
+ /* mm6 = (1<<shift) broadcast to 4 words, mm7 = 0 (for byte unpacking) */
+ __asm__ volatile(
+ "movd %0, %%mm6 \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ "punpcklwd %%mm6, %%mm6 \n\t"
+ "punpcklwd %%mm6, %%mm6 \n\t"
+ :: "r"(1<<shift)
+ );
+
+ for(x=0; x<w; x+=4){
+ uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
+ oxs - dxys + dxxs*(x+1),
+ oxs - dxys + dxxs*(x+2),
+ oxs - dxys + dxxs*(x+3) };
+ uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
+ oys - dyys + dyxs*(x+1),
+ oys - dyys + dyxs*(x+2),
+ oys - dyys + dyxs*(x+3) };
+
+ for(y=0; y<h; y++){
+ /* step dx4/dy4 by the per-row deltas and keep the top 4 bits
+  * (psrlw $12) as the bilinear interpolation fractions */
+ __asm__ volatile(
+ "movq %0, %%mm4 \n\t"
+ "movq %1, %%mm5 \n\t"
+ "paddw %2, %%mm4 \n\t"
+ "paddw %3, %%mm5 \n\t"
+ "movq %%mm4, %0 \n\t"
+ "movq %%mm5, %1 \n\t"
+ "psrlw $12, %%mm4 \n\t"
+ "psrlw $12, %%mm5 \n\t"
+ : "+m"(*dx4), "+m"(*dy4)
+ : "m"(*dxy4), "m"(*dyy4)
+ );
+
+ /* bilinear blend of the four neighbouring pixels, + rounder r4,
+  * >> shift2, then pack back to 4 bytes of dst */
+ __asm__ volatile(
+ "movq %%mm6, %%mm2 \n\t"
+ "movq %%mm6, %%mm1 \n\t"
+ "psubw %%mm4, %%mm2 \n\t"
+ "psubw %%mm5, %%mm1 \n\t"
+ "movq %%mm2, %%mm0 \n\t"
+ "movq %%mm4, %%mm3 \n\t"
+ "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
+ "pmullw %%mm5, %%mm3 \n\t" // dx*dy
+ "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
+ "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
+
+ "movd %4, %%mm5 \n\t"
+ "movd %3, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
+ "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
+
+ "movd %2, %%mm5 \n\t"
+ "movd %1, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
+ "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
+ "paddw %5, %%mm1 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+
+ "psrlw %6, %%mm0 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "movd %%mm0, %0 \n\t"
+
+ : "=m"(dst[x+y*stride])
+ : "m"(src[0]), "m"(src[1]),
+ "m"(src[stride]), "m"(src[stride+1]),
+ "m"(*r4), "m"(shift2)
+ );
+ src += stride;
+ }
+ /* rewind to the top of the next 4-pixel column */
+ src += 4-h*stride;
+ }
+}
+
+/* Generate a cache-prefetch helper that touches h lines, stride bytes apart,
+ * with the given prefetch instruction (prefetcht0 on MMX2, 3DNow! prefetch). */
+#define PREFETCH(name, op) \
+static void name(void *mem, int stride, int h){\
+ const uint8_t *p= mem;\
+ do{\
+ __asm__ volatile(#op" %0" :: "m"(*p));\
+ p+= stride;\
+ }while(--h);\
+}
+PREFETCH(prefetch_mmx2, prefetcht0)
+PREFETCH(prefetch_3dnow, prefetch)
+#undef PREFETCH
+
+/* H.264 and RV40 MMX DSP code is compiled into this translation unit so it
+ * can reuse the macros and static helpers defined above. */
+#include "h264dsp_mmx.c"
+#include "rv40dsp_mmx.c"
+
+/* CAVS specific */
+/* Init entry points implemented in the CAVS DSP files. */
+void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
+void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx);
+
+/* Full-pel (mc00) CAVS copy/average wrappers: plain pixel copy or average,
+ * delegating to the generic MMX pixel routines. */
+void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+ put_pixels8_mmx(dst, src, stride, 8);
+}
+void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+ avg_pixels8_mmx(dst, src, stride, 8);
+}
+void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+ put_pixels16_mmx(dst, src, stride, 16);
+}
+void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+ avg_pixels16_mmx(dst, src, stride, 16);
+}
+
+/* VC1 specific */
+void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
+
+/* VC-1 full-pel mspel wrapper: rnd is ignored for a plain copy. */
+void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
+ put_pixels8_mmx(dst, src, stride, 8);
+}
+
+/* external functions, from idct_mmx.c */
+void ff_mmx_idct(DCTELEM *block);
+void ff_mmxext_idct(DCTELEM *block);
+
+/* Glue adapting raw in-place IDCTs to the DSPContext idct_put/idct_add
+ * interface: run the IDCT on the coefficient block, then either clamp-store
+ * or clamp-accumulate the result into the destination picture. */
+/* XXX: those functions should be suppressed ASAP when all IDCTs are
+ converted */
+#ifdef CONFIG_GPL
+/* libmpeg2-derived IDCTs, only built under GPL. */
+static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_mmx_idct (block);
+ put_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_mmx_idct (block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_mmxext_idct (block);
+ put_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_mmxext_idct (block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
+#endif
+/* XviD-style IDCT wrappers (MMX and MMX2 flavours). */
+static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_idct_xvid_mmx (block);
+ put_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_idct_xvid_mmx (block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_idct_xvid_mmx2 (block);
+ put_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_idct_xvid_mmx2 (block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
+
+/* Vorbis inverse channel coupling: rewrite the magnitude/angle float pairs
+ * in place into the two decoupled channels, using sign manipulation as
+ * annotated on the pfadd/pfsub lines.  3DNow! version: 2 floats per
+ * iteration, femms to leave MMX state clean.  NOTE(review): assumes
+ * blocksize is a multiple of 2 — confirm against callers. */
+static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
+{
+ int i;
+ __asm__ volatile("pxor %%mm7, %%mm7":);
+ for(i=0; i<blocksize; i+=2) {
+ __asm__ volatile(
+ "movq %0, %%mm0 \n\t"
+ "movq %1, %%mm1 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm1, %%mm3 \n\t"
+ "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
+ "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
+ "pslld $31, %%mm2 \n\t" // keep only the sign bit
+ "pxor %%mm2, %%mm1 \n\t"
+ "movq %%mm3, %%mm4 \n\t"
+ "pand %%mm1, %%mm3 \n\t"
+ "pandn %%mm1, %%mm4 \n\t"
+ "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
+ "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm0, %0 \n\t"
+ :"+m"(mag[i]), "+m"(ang[i])
+ ::"memory"
+ );
+ }
+ __asm__ volatile("femms");
+}
+/* SSE version of the same coupling: 4 floats per iteration, using the
+ * ff_pdw_80000000 constant as the sign-bit mask.  NOTE(review): assumes
+ * 16-byte-aligned buffers (movaps) and blocksize % 4 == 0 — confirm. */
+static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
+{
+ int i;
+
+ __asm__ volatile(
+ "movaps %0, %%xmm5 \n\t"
+ ::"m"(ff_pdw_80000000[0])
+ );
+ for(i=0; i<blocksize; i+=4) {
+ __asm__ volatile(
+ "movaps %0, %%xmm0 \n\t"
+ "movaps %1, %%xmm1 \n\t"
+ "xorps %%xmm2, %%xmm2 \n\t"
+ "xorps %%xmm3, %%xmm3 \n\t"
+ "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
+ "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
+ "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
+ "xorps %%xmm2, %%xmm1 \n\t"
+ "movaps %%xmm3, %%xmm4 \n\t"
+ "andps %%xmm1, %%xmm3 \n\t"
+ "andnps %%xmm1, %%xmm4 \n\t"
+ "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
+ "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
+ "movaps %%xmm3, %1 \n\t"
+ "movaps %%xmm0, %0 \n\t"
+ :"+m"(mag[i]), "+m"(ang[i])
+ ::"memory"
+ );
+ }
+}
+
+/* Conditional-expansion helpers for the downmix macros: IF1 keeps its
+ * argument, IF0 drops it — used to enable/disable the stereo-only lines. */
+#define IF1(x) x
+#define IF0(x)
+
+/* Fast 5-channel downmix.  Expands inside ac3_downmix_sse and uses its
+ * locals: i (negative byte offset counting up to 0), samples, matrix, len.
+ * Three scalar coefficients (matrix bytes 0, 8, 24) are broadcast to
+ * xmm5-7; the five input channels sit 0x400 bytes (256 floats) apart.
+ * mono()/stereo() wrap the lines used only in the respective mode. */
+#define MIX5(mono,stereo)\
+ __asm__ volatile(\
+ "movss 0(%2), %%xmm5 \n"\
+ "movss 8(%2), %%xmm6 \n"\
+ "movss 24(%2), %%xmm7 \n"\
+ "shufps $0, %%xmm5, %%xmm5 \n"\
+ "shufps $0, %%xmm6, %%xmm6 \n"\
+ "shufps $0, %%xmm7, %%xmm7 \n"\
+ "1: \n"\
+ "movaps (%0,%1), %%xmm0 \n"\
+ "movaps 0x400(%0,%1), %%xmm1 \n"\
+ "movaps 0x800(%0,%1), %%xmm2 \n"\
+ "movaps 0xc00(%0,%1), %%xmm3 \n"\
+ "movaps 0x1000(%0,%1), %%xmm4 \n"\
+ "mulps %%xmm5, %%xmm0 \n"\
+ "mulps %%xmm6, %%xmm1 \n"\
+ "mulps %%xmm5, %%xmm2 \n"\
+ "mulps %%xmm7, %%xmm3 \n"\
+ "mulps %%xmm7, %%xmm4 \n"\
+ stereo("addps %%xmm1, %%xmm0 \n")\
+ "addps %%xmm1, %%xmm2 \n"\
+ "addps %%xmm3, %%xmm0 \n"\
+ "addps %%xmm4, %%xmm2 \n"\
+ mono("addps %%xmm2, %%xmm0 \n")\
+ "movaps %%xmm0, (%0,%1) \n"\
+ stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
+ "add $16, %0 \n"\
+ "jl 1b \n"\
+ :"+&r"(i)\
+ :"r"(samples[0]+len), "r"(matrix)\
+ :"memory"\
+ );
+
+/* Generic matrix downmix for arbitrary channel counts.  Also expands inside
+ * ac3_downmix_sse; walks the in_ch input channels (1024 bytes apart) and
+ * the per-channel coefficient pairs in matrix_simd (32 bytes per input
+ * channel), accumulating one (mono) or two (stereo) output channels. */
+#define MIX_MISC(stereo)\
+ __asm__ volatile(\
+ "1: \n"\
+ "movaps (%3,%0), %%xmm0 \n"\
+ stereo("movaps %%xmm0, %%xmm1 \n")\
+ "mulps %%xmm6, %%xmm0 \n"\
+ stereo("mulps %%xmm7, %%xmm1 \n")\
+ "lea 1024(%3,%0), %1 \n"\
+ "mov %5, %2 \n"\
+ "2: \n"\
+ "movaps (%1), %%xmm2 \n"\
+ stereo("movaps %%xmm2, %%xmm3 \n")\
+ "mulps (%4,%2), %%xmm2 \n"\
+ stereo("mulps 16(%4,%2), %%xmm3 \n")\
+ "addps %%xmm2, %%xmm0 \n"\
+ stereo("addps %%xmm3, %%xmm1 \n")\
+ "add $1024, %1 \n"\
+ "add $32, %2 \n"\
+ "jl 2b \n"\
+ "movaps %%xmm0, (%3,%0) \n"\
+ stereo("movaps %%xmm1, 1024(%3,%0) \n")\
+ "add $16, %0 \n"\
+ "jl 1b \n"\
+ :"+&r"(i), "=&r"(j), "=&r"(k)\
+ :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
+ :"memory"\
+ );
+
+/* In-place SSE downmix of in_ch channel buffers to out_ch channels.
+ * The bitwise compares on matrix_cmp detect the common symmetric 5->2 and
+ * 5->1 coefficient layouts and dispatch to the MIX5 fast path; otherwise
+ * the coefficients are splatted into matrix_simd and MIX_MISC is used. */
+static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
+{
+ int (*matrix_cmp)[2] = (int(*)[2])matrix;
+ intptr_t i,j,k;
+
+ i = -len*sizeof(float);
+ if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
+ MIX5(IF0,IF1);
+ } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
+ MIX5(IF1,IF0);
+ } else {
+ DECLARE_ALIGNED_16(float, matrix_simd[in_ch][2][4]);
+ j = 2*in_ch*sizeof(float);
+ /* broadcast each (left,right) coefficient pair to two 4-float rows */
+ __asm__ volatile(
+ "1: \n"
+ "sub $8, %0 \n"
+ "movss (%2,%0), %%xmm6 \n"
+ "movss 4(%2,%0), %%xmm7 \n"
+ "shufps $0, %%xmm6, %%xmm6 \n"
+ "shufps $0, %%xmm7, %%xmm7 \n"
+ "movaps %%xmm6, (%1,%0,4) \n"
+ "movaps %%xmm7, 16(%1,%0,4) \n"
+ "jg 1b \n"
+ :"+&r"(j)
+ :"r"(matrix_simd), "r"(matrix)
+ :"memory"
+ );
+ if(out_ch == 2) {
+ MIX_MISC(IF1);
+ } else {
+ MIX_MISC(IF0);
+ }
+ }
+}
+
+/* In-place elementwise multiply: dst[i] *= src[i] for len floats, walking
+ * the arrays back-to-front 4 floats (3DNow!) per iteration; femms restores
+ * FPU state.  NOTE(review): assumes len is a multiple of 4 — confirm. */
+static void vector_fmul_3dnow(float *dst, const float *src, int len){
+ x86_reg i = (len-4)*4;
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%1,%0), %%mm0 \n\t"
+ "movq 8(%1,%0), %%mm1 \n\t"
+ "pfmul (%2,%0), %%mm0 \n\t"
+ "pfmul 8(%2,%0), %%mm1 \n\t"
+ "movq %%mm0, (%1,%0) \n\t"
+ "movq %%mm1, 8(%1,%0) \n\t"
+ "sub $16, %0 \n\t"
+ "jge 1b \n\t"
+ "femms \n\t"
+ :"+r"(i)
+ :"r"(dst), "r"(src)
+ :"memory"
+ );
+}
+/* SSE version: 8 floats per iteration.  NOTE(review): movaps requires
+ * 16-byte alignment and len % 8 == 0 — confirm against callers. */
+static void vector_fmul_sse(float *dst, const float *src, int len){
+ x86_reg i = (len-8)*4;
+ __asm__ volatile(
+ "1: \n\t"
+ "movaps (%1,%0), %%xmm0 \n\t"
+ "movaps 16(%1,%0), %%xmm1 \n\t"
+ "mulps (%2,%0), %%xmm0 \n\t"
+ "mulps 16(%2,%0), %%xmm1 \n\t"
+ "movaps %%xmm0, (%1,%0) \n\t"
+ "movaps %%xmm1, 16(%1,%0) \n\t"
+ "sub $32, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i)
+ :"r"(dst), "r"(src)
+ :"memory"
+ );
+}
+
+/* dst[i] = src0[i] * src1[len-1-i]: src1 is read forward while pswapd
+ * reverses each 2-float pair and the store index counts down, so src1 is
+ * effectively consumed in reverse order.  NOTE(review): no "memory"
+ * clobber on this asm — relies on the +m/+r operand constraints; verify. */
+static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
+ x86_reg i = len*4-16;
+ __asm__ volatile(
+ "1: \n\t"
+ "pswapd 8(%1), %%mm0 \n\t"
+ "pswapd (%1), %%mm1 \n\t"
+ "pfmul (%3,%0), %%mm0 \n\t"
+ "pfmul 8(%3,%0), %%mm1 \n\t"
+ "movq %%mm0, (%2,%0) \n\t"
+ "movq %%mm1, 8(%2,%0) \n\t"
+ "add $16, %1 \n\t"
+ "sub $16, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i), "+r"(src1)
+ :"r"(dst), "r"(src0)
+ );
+ __asm__ volatile("femms");
+}
+/* SSE version of the reversed multiply: shufps $0x1b reverses the 4 floats
+ * of each vector; 8 floats per iteration, aligned loads/stores. */
+static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
+ x86_reg i = len*4-32;
+ __asm__ volatile(
+ "1: \n\t"
+ "movaps 16(%1), %%xmm0 \n\t"
+ "movaps (%1), %%xmm1 \n\t"
+ "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
+ "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
+ "mulps (%3,%0), %%xmm0 \n\t"
+ "mulps 16(%3,%0), %%xmm1 \n\t"
+ "movaps %%xmm0, (%2,%0) \n\t"
+ "movaps %%xmm1, 16(%2,%0) \n\t"
+ "add $32, %1 \n\t"
+ "sub $32, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i), "+r"(src1)
+ :"r"(dst), "r"(src0)
+ );
+}
+
+/* dst = src0*src1 + src2 over len floats.  Fast 3DNow! paths only for
+ * src3 == 0 with step 1 (contiguous output) or step 2 (output written to
+ * every other float, scattered with movd/psrlq); anything else falls back
+ * to the C implementation.  femms runs on every path to reset MMX state. */
+static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
+ const float *src2, int src3, int len, int step){
+ x86_reg i = (len-4)*4;
+ if(step == 2 && src3 == 0){
+ dst += (len-4)*2;
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%2,%0), %%mm0 \n\t"
+ "movq 8(%2,%0), %%mm1 \n\t"
+ "pfmul (%3,%0), %%mm0 \n\t"
+ "pfmul 8(%3,%0), %%mm1 \n\t"
+ "pfadd (%4,%0), %%mm0 \n\t"
+ "pfadd 8(%4,%0), %%mm1 \n\t"
+ "movd %%mm0, (%1) \n\t"
+ "movd %%mm1, 16(%1) \n\t"
+ "psrlq $32, %%mm0 \n\t"
+ "psrlq $32, %%mm1 \n\t"
+ "movd %%mm0, 8(%1) \n\t"
+ "movd %%mm1, 24(%1) \n\t"
+ "sub $32, %1 \n\t"
+ "sub $16, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i), "+r"(dst)
+ :"r"(src0), "r"(src1), "r"(src2)
+ :"memory"
+ );
+ }
+ else if(step == 1 && src3 == 0){
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%2,%0), %%mm0 \n\t"
+ "movq 8(%2,%0), %%mm1 \n\t"
+ "pfmul (%3,%0), %%mm0 \n\t"
+ "pfmul 8(%3,%0), %%mm1 \n\t"
+ "pfadd (%4,%0), %%mm0 \n\t"
+ "pfadd 8(%4,%0), %%mm1 \n\t"
+ "movq %%mm0, (%1,%0) \n\t"
+ "movq %%mm1, 8(%1,%0) \n\t"
+ "sub $16, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i)
+ :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
+ :"memory"
+ );
+ }
+ else
+ ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+ __asm__ volatile("femms");
+}
+/* SSE counterpart: 8 floats per iteration; the step==2 path scatters each
+ * result vector with movss/movhlps/shufps into every other output slot.
+ * NOTE(review): aligned loads assume 16-byte-aligned inputs — confirm. */
+static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
+ const float *src2, int src3, int len, int step){
+ x86_reg i = (len-8)*4;
+ if(step == 2 && src3 == 0){
+ dst += (len-8)*2;
+ __asm__ volatile(
+ "1: \n\t"
+ "movaps (%2,%0), %%xmm0 \n\t"
+ "movaps 16(%2,%0), %%xmm1 \n\t"
+ "mulps (%3,%0), %%xmm0 \n\t"
+ "mulps 16(%3,%0), %%xmm1 \n\t"
+ "addps (%4,%0), %%xmm0 \n\t"
+ "addps 16(%4,%0), %%xmm1 \n\t"
+ "movss %%xmm0, (%1) \n\t"
+ "movss %%xmm1, 32(%1) \n\t"
+ "movhlps %%xmm0, %%xmm2 \n\t"
+ "movhlps %%xmm1, %%xmm3 \n\t"
+ "movss %%xmm2, 16(%1) \n\t"
+ "movss %%xmm3, 48(%1) \n\t"
+ "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
+ "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
+ "movss %%xmm0, 8(%1) \n\t"
+ "movss %%xmm1, 40(%1) \n\t"
+ "movhlps %%xmm0, %%xmm2 \n\t"
+ "movhlps %%xmm1, %%xmm3 \n\t"
+ "movss %%xmm2, 24(%1) \n\t"
+ "movss %%xmm3, 56(%1) \n\t"
+ "sub $64, %1 \n\t"
+ "sub $32, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i), "+r"(dst)
+ :"r"(src0), "r"(src1), "r"(src2)
+ :"memory"
+ );
+ }
+ else if(step == 1 && src3 == 0){
+ __asm__ volatile(
+ "1: \n\t"
+ "movaps (%2,%0), %%xmm0 \n\t"
+ "movaps 16(%2,%0), %%xmm1 \n\t"
+ "mulps (%3,%0), %%xmm0 \n\t"
+ "mulps 16(%3,%0), %%xmm1 \n\t"
+ "addps (%4,%0), %%xmm0 \n\t"
+ "addps 16(%4,%0), %%xmm1 \n\t"
+ "movaps %%xmm0, (%1,%0) \n\t"
+ "movaps %%xmm1, 16(%1,%0) \n\t"
+ "sub $32, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i)
+ :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
+ :"memory"
+ );
+ }
+ else
+ ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+}
+
+/* Overlap-windowing of two input halves into dst, computing the symmetric
+ * sum/difference products annotated in the asm (src0 walked forward via i,
+ * src1 backward via j, window applied from both ends).  The asm path needs
+ * HAVE_6REGS and add_bias == 0; everything else uses the C fallback. */
+static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
+ const float *win, float add_bias, int len){
+#ifdef HAVE_6REGS
+ if(add_bias == 0){
+ x86_reg i = -len*4;
+ x86_reg j = len*4-8;
+ __asm__ volatile(
+ "1: \n"
+ "pswapd (%5,%1), %%mm1 \n"
+ "movq (%5,%0), %%mm0 \n"
+ "pswapd (%4,%1), %%mm5 \n"
+ "movq (%3,%0), %%mm4 \n"
+ "movq %%mm0, %%mm2 \n"
+ "movq %%mm1, %%mm3 \n"
+ "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
+ "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j]
+ "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
+ "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i]
+ "pfadd %%mm3, %%mm2 \n"
+ "pfsub %%mm0, %%mm1 \n"
+ "pswapd %%mm2, %%mm2 \n"
+ "movq %%mm1, (%2,%0) \n"
+ "movq %%mm2, (%2,%1) \n"
+ "sub $8, %1 \n"
+ "add $8, %0 \n"
+ "jl 1b \n"
+ "femms \n"
+ :"+r"(i), "+r"(j)
+ :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
+ );
+ }else
+#endif
+ ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
+}
+
+/* SSE version of the windowing above: 4 floats per step, shufps $0x1b used
+ * in place of pswapd to reverse the backward-running operands.
+ * NOTE(review): movaps assumes 16-byte alignment — confirm callers. */
+static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
+ const float *win, float add_bias, int len){
+#ifdef HAVE_6REGS
+ if(add_bias == 0){
+ x86_reg i = -len*4;
+ x86_reg j = len*4-16;
+ __asm__ volatile(
+ "1: \n"
+ "movaps (%5,%1), %%xmm1 \n"
+ "movaps (%5,%0), %%xmm0 \n"
+ "movaps (%4,%1), %%xmm5 \n"
+ "movaps (%3,%0), %%xmm4 \n"
+ "shufps $0x1b, %%xmm1, %%xmm1 \n"
+ "shufps $0x1b, %%xmm5, %%xmm5 \n"
+ "movaps %%xmm0, %%xmm2 \n"
+ "movaps %%xmm1, %%xmm3 \n"
+ "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
+ "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]
+ "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
+ "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]
+ "addps %%xmm3, %%xmm2 \n"
+ "subps %%xmm0, %%xmm1 \n"
+ "shufps $0x1b, %%xmm2, %%xmm2 \n"
+ "movaps %%xmm1, (%2,%0) \n"
+ "movaps %%xmm2, (%2,%1) \n"
+ "sub $16, %1 \n"
+ "add $16, %0 \n"
+ "jl 1b \n"
+ :"+r"(i), "+r"(j)
+ :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
+ );
+ }else
+#endif
+ ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
+}
+
+/* dst[i] = (float)src[i] * mul for len elements, walking forward via a
+ * negative index.  SSE version converts two 64-bit halves with cvtpi2ps and
+ * merges them via movlhps.  NOTE(review): cvtpi2ps touches MMX state and no
+ * emms is issued here — presumably handled by callers; confirm. */
+static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
+{
+ x86_reg i = -4*len;
+ __asm__ volatile(
+ "movss %3, %%xmm4 \n"
+ "shufps $0, %%xmm4, %%xmm4 \n"
+ "1: \n"
+ "cvtpi2ps (%2,%0), %%xmm0 \n"
+ "cvtpi2ps 8(%2,%0), %%xmm1 \n"
+ "cvtpi2ps 16(%2,%0), %%xmm2 \n"
+ "cvtpi2ps 24(%2,%0), %%xmm3 \n"
+ "movlhps %%xmm1, %%xmm0 \n"
+ "movlhps %%xmm3, %%xmm2 \n"
+ "mulps %%xmm4, %%xmm0 \n"
+ "mulps %%xmm4, %%xmm2 \n"
+ "movaps %%xmm0, (%1,%0) \n"
+ "movaps %%xmm2, 16(%1,%0) \n"
+ "add $32, %0 \n"
+ "jl 1b \n"
+ :"+r"(i)
+ :"r"(dst+len), "r"(src+len), "m"(mul)
+ );
+}
+
+/* SSE2 version: cvtdq2ps converts 4 ints at once entirely in XMM registers,
+ * so no MMX state is involved. */
+static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
+{
+ x86_reg i = -4*len;
+ __asm__ volatile(
+ "movss %3, %%xmm4 \n"
+ "shufps $0, %%xmm4, %%xmm4 \n"
+ "1: \n"
+ "cvtdq2ps (%2,%0), %%xmm0 \n"
+ "cvtdq2ps 16(%2,%0), %%xmm1 \n"
+ "mulps %%xmm4, %%xmm0 \n"
+ "mulps %%xmm4, %%xmm1 \n"
+ "movaps %%xmm0, (%1,%0) \n"
+ "movaps %%xmm1, 16(%1,%0) \n"
+ "add $32, %0 \n"
+ "jl 1b \n"
+ :"+r"(i)
+ :"r"(dst+len), "r"(src+len), "m"(mul)
+ );
+}
+
+static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
+ x86_reg reglen = len;
+ // not bit-exact: pf2id uses different rounding than C and SSE
+ __asm__ volatile(
+ "add %0 , %0 \n\t"
+ "lea (%2,%0,2) , %2 \n\t"
+ "add %0 , %1 \n\t"
+ "neg %0 \n\t"
+ "1: \n\t"
+ "pf2id (%2,%0,2) , %%mm0 \n\t"
+ "pf2id 8(%2,%0,2) , %%mm1 \n\t"
+ "pf2id 16(%2,%0,2) , %%mm2 \n\t"
+ "pf2id 24(%2,%0,2) , %%mm3 \n\t"
+ "packssdw %%mm1 , %%mm0 \n\t"
+ "packssdw %%mm3 , %%mm2 \n\t"
+ "movq %%mm0 , (%1,%0) \n\t"
+ "movq %%mm2 , 8(%1,%0) \n\t"
+ "add $16 , %0 \n\t"
+ " js 1b \n\t"
+ "femms \n\t"
+ :"+r"(reglen), "+r"(dst), "+r"(src)
+ );
+}
+static void float_to_int16_sse(int16_t *dst, const float *src, long len){
+ x86_reg reglen = len;
+ __asm__ volatile(
+ "add %0 , %0 \n\t"
+ "lea (%2,%0,2) , %2 \n\t"
+ "add %0 , %1 \n\t"
+ "neg %0 \n\t"
+ "1: \n\t"
+ "cvtps2pi (%2,%0,2) , %%mm0 \n\t"
+ "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
+ "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
+ "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
+ "packssdw %%mm1 , %%mm0 \n\t"
+ "packssdw %%mm3 , %%mm2 \n\t"
+ "movq %%mm0 , (%1,%0) \n\t"
+ "movq %%mm2 , 8(%1,%0) \n\t"
+ "add $16 , %0 \n\t"
+ " js 1b \n\t"
+ "emms \n\t"
+ :"+r"(reglen), "+r"(dst), "+r"(src)
+ );
+}
+
+static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
+ x86_reg reglen = len;
+ __asm__ volatile(
+ "add %0 , %0 \n\t"
+ "lea (%2,%0,2) , %2 \n\t"
+ "add %0 , %1 \n\t"
+ "neg %0 \n\t"
+ "1: \n\t"
+ "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
+ "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
+ "packssdw %%xmm1 , %%xmm0 \n\t"
+ "movdqa %%xmm0 , (%1,%0) \n\t"
+ "add $16 , %0 \n\t"
+ " js 1b \n\t"
+ :"+r"(reglen), "+r"(dst), "+r"(src)
+ );
+}
+
+#ifdef HAVE_YASM
+void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
+void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
+void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
+void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
+void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
+void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
+#ifdef ARCH_X86_32
+static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
+{
+ ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
+ ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
+}
+#endif
+void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
+void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
+#else
+#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
+#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
+#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
+#endif
+#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
+
+#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
+/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
+static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
+ DECLARE_ALIGNED_16(int16_t, tmp[len]);\
+ int i,j,c;\
+ for(c=0; c<channels; c++){\
+ float_to_int16_##cpu(tmp, src[c], len);\
+ for(i=0, j=c; i<len; i++, j+=channels)\
+ dst[j] = tmp[i];\
+ }\
+}\
+\
+static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
+ if(channels==1)\
+ float_to_int16_##cpu(dst, src[0], len);\
+ else if(channels==2){\
+ x86_reg reglen = len; \
+ const float *src0 = src[0];\
+ const float *src1 = src[1];\
+ __asm__ volatile(\
+ "shl $2, %0 \n"\
+ "add %0, %1 \n"\
+ "add %0, %2 \n"\
+ "add %0, %3 \n"\
+ "neg %0 \n"\
+ body\
+ :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
+ );\
+ }else if(channels==6){\
+ ff_float_to_int16_interleave6_##cpu(dst, src, len);\
+ }else\
+ float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
+}
+
+FLOAT_TO_INT16_INTERLEAVE(3dnow,
+ "1: \n"
+ "pf2id (%2,%0), %%mm0 \n"
+ "pf2id 8(%2,%0), %%mm1 \n"
+ "pf2id (%3,%0), %%mm2 \n"
+ "pf2id 8(%3,%0), %%mm3 \n"
+ "packssdw %%mm1, %%mm0 \n"
+ "packssdw %%mm3, %%mm2 \n"
+ "movq %%mm0, %%mm1 \n"
+ "punpcklwd %%mm2, %%mm0 \n"
+ "punpckhwd %%mm2, %%mm1 \n"
+ "movq %%mm0, (%1,%0)\n"
+ "movq %%mm1, 8(%1,%0)\n"
+ "add $16, %0 \n"
+ "js 1b \n"
+ "femms \n"
+)
+
+FLOAT_TO_INT16_INTERLEAVE(sse,
+ "1: \n"
+ "cvtps2pi (%2,%0), %%mm0 \n"
+ "cvtps2pi 8(%2,%0), %%mm1 \n"
+ "cvtps2pi (%3,%0), %%mm2 \n"
+ "cvtps2pi 8(%3,%0), %%mm3 \n"
+ "packssdw %%mm1, %%mm0 \n"
+ "packssdw %%mm3, %%mm2 \n"
+ "movq %%mm0, %%mm1 \n"
+ "punpcklwd %%mm2, %%mm0 \n"
+ "punpckhwd %%mm2, %%mm1 \n"
+ "movq %%mm0, (%1,%0)\n"
+ "movq %%mm1, 8(%1,%0)\n"
+ "add $16, %0 \n"
+ "js 1b \n"
+ "emms \n"
+)
+
+FLOAT_TO_INT16_INTERLEAVE(sse2,
+ "1: \n"
+ "cvtps2dq (%2,%0), %%xmm0 \n"
+ "cvtps2dq (%3,%0), %%xmm1 \n"
+ "packssdw %%xmm1, %%xmm0 \n"
+ "movhlps %%xmm0, %%xmm1 \n"
+ "punpcklwd %%xmm1, %%xmm0 \n"
+ "movdqa %%xmm0, (%1,%0) \n"
+ "add $16, %0 \n"
+ "js 1b \n"
+)
+
+static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
+ if(channels==6)
+ ff_float_to_int16_interleave6_3dn2(dst, src, len);
+ else
+ float_to_int16_interleave_3dnow(dst, src, len, channels);
+}
+
+
+void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
+void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
+void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
+void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
+void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+
+
+static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
+{
+ x86_reg o = -(order << 1);
+ v1 += order;
+ v2 += order;
+ __asm__ volatile(
+ "1: \n\t"
+ "movdqu (%1,%2), %%xmm0 \n\t"
+ "movdqu 16(%1,%2), %%xmm1 \n\t"
+ "paddw (%0,%2), %%xmm0 \n\t"
+ "paddw 16(%0,%2), %%xmm1 \n\t"
+ "movdqa %%xmm0, (%0,%2) \n\t"
+ "movdqa %%xmm1, 16(%0,%2) \n\t"
+ "add $32, %2 \n\t"
+ "js 1b \n\t"
+ : "+r"(v1), "+r"(v2), "+r"(o)
+ );
+}
+
+static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
+{
+ x86_reg o = -(order << 1);
+ v1 += order;
+ v2 += order;
+ __asm__ volatile(
+ "1: \n\t"
+ "movdqa (%0,%2), %%xmm0 \n\t"
+ "movdqa 16(%0,%2), %%xmm2 \n\t"
+ "movdqu (%1,%2), %%xmm1 \n\t"
+ "movdqu 16(%1,%2), %%xmm3 \n\t"
+ "psubw %%xmm1, %%xmm0 \n\t"
+ "psubw %%xmm3, %%xmm2 \n\t"
+ "movdqa %%xmm0, (%0,%2) \n\t"
+ "movdqa %%xmm2, 16(%0,%2) \n\t"
+ "add $32, %2 \n\t"
+ "js 1b \n\t"
+ : "+r"(v1), "+r"(v2), "+r"(o)
+ );
+}
+
+static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+{
+ int res = 0;
+ DECLARE_ALIGNED_16(int64_t, sh);
+ x86_reg o = -(order << 1);
+
+ v1 += order;
+ v2 += order;
+ sh = shift;
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7 \n\t"
+ "1: \n\t"
+ "movdqu (%0,%3), %%xmm0 \n\t"
+ "movdqu 16(%0,%3), %%xmm1 \n\t"
+ "pmaddwd (%1,%3), %%xmm0 \n\t"
+ "pmaddwd 16(%1,%3), %%xmm1 \n\t"
+ "paddd %%xmm0, %%xmm7 \n\t"
+ "paddd %%xmm1, %%xmm7 \n\t"
+ "add $32, %3 \n\t"
+ "js 1b \n\t"
+ "movhlps %%xmm7, %%xmm2 \n\t"
+ "paddd %%xmm2, %%xmm7 \n\t"
+ "psrad %4, %%xmm7 \n\t"
+ "pshuflw $0x4E, %%xmm7,%%xmm2 \n\t"
+ "paddd %%xmm2, %%xmm7 \n\t"
+ "movd %%xmm7, %2 \n\t"
+ : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
+ : "m"(sh)
+ );
+ return res;
+}
+
+void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
+{
+ mm_flags = mm_support();
+
+ if (avctx->dsp_mask) {
+ if (avctx->dsp_mask & FF_MM_FORCE)
+ mm_flags |= (avctx->dsp_mask & 0xffff);
+ else
+ mm_flags &= ~(avctx->dsp_mask & 0xffff);
+ }
+
+#if 0
+ av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
+ if (mm_flags & FF_MM_MMX)
+ av_log(avctx, AV_LOG_INFO, " mmx");
+ if (mm_flags & FF_MM_MMXEXT)
+ av_log(avctx, AV_LOG_INFO, " mmxext");
+ if (mm_flags & FF_MM_3DNOW)
+ av_log(avctx, AV_LOG_INFO, " 3dnow");
+ if (mm_flags & FF_MM_SSE)
+ av_log(avctx, AV_LOG_INFO, " sse");
+ if (mm_flags & FF_MM_SSE2)
+ av_log(avctx, AV_LOG_INFO, " sse2");
+ av_log(avctx, AV_LOG_INFO, "\n");
+#endif
+
+ if (mm_flags & FF_MM_MMX) {
+ const int idct_algo= avctx->idct_algo;
+
+ if(avctx->lowres==0){
+ if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
+ c->idct_put= ff_simple_idct_put_mmx;
+ c->idct_add= ff_simple_idct_add_mmx;
+ c->idct = ff_simple_idct_mmx;
+ c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
+#ifdef CONFIG_GPL
+ }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
+ if(mm_flags & FF_MM_MMXEXT){
+ c->idct_put= ff_libmpeg2mmx2_idct_put;
+ c->idct_add= ff_libmpeg2mmx2_idct_add;
+ c->idct = ff_mmxext_idct;
+ }else{
+ c->idct_put= ff_libmpeg2mmx_idct_put;
+ c->idct_add= ff_libmpeg2mmx_idct_add;
+ c->idct = ff_mmx_idct;
+ }
+ c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
+#endif
+ }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER) &&
+ idct_algo==FF_IDCT_VP3){
+ if(mm_flags & FF_MM_SSE2){
+ c->idct_put= ff_vp3_idct_put_sse2;
+ c->idct_add= ff_vp3_idct_add_sse2;
+ c->idct = ff_vp3_idct_sse2;
+ c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
+ }else{
+ c->idct_put= ff_vp3_idct_put_mmx;
+ c->idct_add= ff_vp3_idct_add_mmx;
+ c->idct = ff_vp3_idct_mmx;
+ c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
+ }
+ }else if(idct_algo==FF_IDCT_CAVS){
+ c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
+ }else if(idct_algo==FF_IDCT_XVIDMMX){
+ if(mm_flags & FF_MM_SSE2){
+ c->idct_put= ff_idct_xvid_sse2_put;
+ c->idct_add= ff_idct_xvid_sse2_add;
+ c->idct = ff_idct_xvid_sse2;
+ c->idct_permutation_type= FF_SSE2_IDCT_PERM;
+ }else if(mm_flags & FF_MM_MMXEXT){
+ c->idct_put= ff_idct_xvid_mmx2_put;
+ c->idct_add= ff_idct_xvid_mmx2_add;
+ c->idct = ff_idct_xvid_mmx2;
+ }else{
+ c->idct_put= ff_idct_xvid_mmx_put;
+ c->idct_add= ff_idct_xvid_mmx_add;
+ c->idct = ff_idct_xvid_mmx;
+ }
+ }
+ }
+
+ c->put_pixels_clamped = put_pixels_clamped_mmx;
+ c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
+ c->add_pixels_clamped = add_pixels_clamped_mmx;
+ c->clear_block = clear_block_mmx;
+ c->clear_blocks = clear_blocks_mmx;
+ if (mm_flags & FF_MM_SSE)
+ c->clear_block = clear_block_sse;
+
+#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
+ c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
+
+ SET_HPEL_FUNCS(put, 0, 16, mmx);
+ SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
+ SET_HPEL_FUNCS(avg, 0, 16, mmx);
+ SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
+ SET_HPEL_FUNCS(put, 1, 8, mmx);
+ SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
+ SET_HPEL_FUNCS(avg, 1, 8, mmx);
+ SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
+
+ c->gmc= gmc_mmx;
+
+ c->add_bytes= add_bytes_mmx;
+ c->add_bytes_l2= add_bytes_l2_mmx;
+
+ c->draw_edges = draw_edges_mmx;
+
+ if (ENABLE_ANY_H263) {
+ c->h263_v_loop_filter= h263_v_loop_filter_mmx;
+ c->h263_h_loop_filter= h263_h_loop_filter_mmx;
+ }
+ c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
+ c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
+ c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
+
+ c->put_rv40_chroma_pixels_tab[0]= put_rv40_chroma_mc8_mmx;
+ c->put_rv40_chroma_pixels_tab[1]= put_rv40_chroma_mc4_mmx;
+
+ c->h264_idct_dc_add=
+ c->h264_idct_add= ff_h264_idct_add_mmx;
+ c->h264_idct8_dc_add=
+ c->h264_idct8_add= ff_h264_idct8_add_mmx;
+
+ c->h264_idct_add16 = ff_h264_idct_add16_mmx;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_mmx;
+ c->h264_idct_add8 = ff_h264_idct_add8_mmx;
+ c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
+
+ if (mm_flags & FF_MM_MMXEXT) {
+ c->prefetch = prefetch_mmx2;
+
+ c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
+ c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
+
+ c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
+ c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
+ c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
+
+ c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
+ c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
+
+ c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
+ c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
+ c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
+
+ c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
+ c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
+ c->h264_idct_add16 = ff_h264_idct_add16_mmx2;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2;
+ c->h264_idct_add8 = ff_h264_idct_add8_mmx2;
+ c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
+
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
+ c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
+ c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
+ c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
+ c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
+ c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
+
+ if (ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) {
+ c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
+ c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
+ }
+ }
+
+#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
+ c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
+
+ SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
+ SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
+ SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
+ SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
+ SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
+ SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
+
+ SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
+ SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
+ SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
+ SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
+ SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
+ SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
+
+ SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
+ SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
+ SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
+ SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
+
+ c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_mmx2;
+ c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_mmx2;
+
+ c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
+ c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
+ c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
+ c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
+ c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
+ c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
+ c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
+ c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
+ c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
+ c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
+ c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
+
+ c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
+ c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
+ c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
+ c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
+ c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
+ c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
+ c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
+ c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
+
+ c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
+ c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
+ c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
+ c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
+ c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
+ c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
+ c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
+ c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
+
+ if (ENABLE_CAVS_DECODER)
+ ff_cavsdsp_init_mmx2(c, avctx);
+
+ if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
+ ff_vc1dsp_init_mmx(c, avctx);
+
+ c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
+ } else if (mm_flags & FF_MM_3DNOW) {
+ c->prefetch = prefetch_3dnow;
+
+ c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
+ c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
+
+ c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
+ c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
+ c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
+
+ c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
+ c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
+
+ c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
+ c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
+ c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
+
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
+ c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
+ c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
+ c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
+ c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
+ c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
+ }
+
+ SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
+ SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
+ SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
+ SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
+ SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
+ SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
+
+ SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
+ SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
+ SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
+ SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
+ SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
+ SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
+
+ SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
+ SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
+ SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
+ SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
+
+ c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
+ c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
+
+ c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_3dnow;
+ c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_3dnow;
+
+ if (ENABLE_CAVS_DECODER)
+ ff_cavsdsp_init_3dnow(c, avctx);
+ }
+
+
+#define H264_QPEL_FUNCS(x, y, CPU)\
+ c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
+ c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
+ c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
+ c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
+ if((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)){
+ // these functions are slower than mmx on AMD, but faster on Intel
+/* FIXME works in most codecs, but crashes svq1 due to unaligned chroma
+ c->put_pixels_tab[0][0] = put_pixels16_sse2;
+ c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
+*/
+ H264_QPEL_FUNCS(0, 0, sse2);
+ }
+ if(mm_flags & FF_MM_SSE2){
+ c->h264_idct8_add = ff_h264_idct8_add_sse2;
+ c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
+
+ H264_QPEL_FUNCS(0, 1, sse2);
+ H264_QPEL_FUNCS(0, 2, sse2);
+ H264_QPEL_FUNCS(0, 3, sse2);
+ H264_QPEL_FUNCS(1, 1, sse2);
+ H264_QPEL_FUNCS(1, 2, sse2);
+ H264_QPEL_FUNCS(1, 3, sse2);
+ H264_QPEL_FUNCS(2, 1, sse2);
+ H264_QPEL_FUNCS(2, 2, sse2);
+ H264_QPEL_FUNCS(2, 3, sse2);
+ H264_QPEL_FUNCS(3, 1, sse2);
+ H264_QPEL_FUNCS(3, 2, sse2);
+ H264_QPEL_FUNCS(3, 3, sse2);
+ }
+#ifdef HAVE_SSSE3
+ if(mm_flags & FF_MM_SSSE3){
+ H264_QPEL_FUNCS(1, 0, ssse3);
+ H264_QPEL_FUNCS(1, 1, ssse3);
+ H264_QPEL_FUNCS(1, 2, ssse3);
+ H264_QPEL_FUNCS(1, 3, ssse3);
+ H264_QPEL_FUNCS(2, 0, ssse3);
+ H264_QPEL_FUNCS(2, 1, ssse3);
+ H264_QPEL_FUNCS(2, 2, ssse3);
+ H264_QPEL_FUNCS(2, 3, ssse3);
+ H264_QPEL_FUNCS(3, 0, ssse3);
+ H264_QPEL_FUNCS(3, 1, ssse3);
+ H264_QPEL_FUNCS(3, 2, ssse3);
+ H264_QPEL_FUNCS(3, 3, ssse3);
+ c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_nornd;
+ c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd;
+ c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd;
+ c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
+ c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
+ c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
+ }
+#endif
+
+#if defined(CONFIG_GPL) && defined(HAVE_YASM)
+ if( mm_flags&FF_MM_MMXEXT ){
+#ifdef ARCH_X86_32
+ c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
+ c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
+#endif
+ if( mm_flags&FF_MM_SSE2 ){
+#if defined(ARCH_X86_64) || !defined(__ICC) || __ICC > 1100
+ c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
+ c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
+ c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
+ c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
+#endif
+ c->h264_idct_add16 = ff_h264_idct_add16_sse2;
+ c->h264_idct_add8 = ff_h264_idct_add8_sse2;
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2;
+ }
+ }
+#endif
+
+#ifdef CONFIG_SNOW_DECODER
+ if(mm_flags & FF_MM_SSE2 & 0){
+ c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
+#ifdef HAVE_7REGS
+ c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
+#endif
+ c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
+ }
+ else{
+ if(mm_flags & FF_MM_MMXEXT){
+ c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
+#ifdef HAVE_7REGS
+ c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
+#endif
+ }
+ c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
+ }
+#endif
+
+ if(mm_flags & FF_MM_3DNOW){
+ c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+ c->vector_fmul = vector_fmul_3dnow;
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->float_to_int16 = float_to_int16_3dnow;
+ c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
+ }
+ }
+ if(mm_flags & FF_MM_3DNOWEXT){
+ c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
+ c->vector_fmul_window = vector_fmul_window_3dnow2;
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
+ }
+ }
+ if(mm_flags & FF_MM_SSE){
+ c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
+ c->ac3_downmix = ac3_downmix_sse;
+ c->vector_fmul = vector_fmul_sse;
+ c->vector_fmul_reverse = vector_fmul_reverse_sse;
+ c->vector_fmul_add_add = vector_fmul_add_add_sse;
+ c->vector_fmul_window = vector_fmul_window_sse;
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
+ c->float_to_int16 = float_to_int16_sse;
+ c->float_to_int16_interleave = float_to_int16_interleave_sse;
+ }
+ if(mm_flags & FF_MM_3DNOW)
+ c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
+ if(mm_flags & FF_MM_SSE2){
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
+ c->float_to_int16 = float_to_int16_sse2;
+ c->float_to_int16_interleave = float_to_int16_interleave_sse2;
+ c->add_int16 = add_int16_sse2;
+ c->sub_int16 = sub_int16_sse2;
+ c->scalarproduct_int16 = scalarproduct_int16_sse2;
+ }
+ }
+
+ if (ENABLE_ENCODERS)
+ dsputilenc_init_mmx(c, avctx);
+
+#if 0
+ // for speed testing
+ get_pixels = just_return;
+ put_pixels_clamped = just_return;
+ add_pixels_clamped = just_return;
+
+ pix_abs16x16 = just_return;
+ pix_abs16x16_x2 = just_return;
+ pix_abs16x16_y2 = just_return;
+ pix_abs16x16_xy2 = just_return;
+
+ put_pixels_tab[0] = just_return;
+ put_pixels_tab[1] = just_return;
+ put_pixels_tab[2] = just_return;
+ put_pixels_tab[3] = just_return;
+
+ put_no_rnd_pixels_tab[0] = just_return;
+ put_no_rnd_pixels_tab[1] = just_return;
+ put_no_rnd_pixels_tab[2] = just_return;
+ put_no_rnd_pixels_tab[3] = just_return;
+
+ avg_pixels_tab[0] = just_return;
+ avg_pixels_tab[1] = just_return;
+ avg_pixels_tab[2] = just_return;
+ avg_pixels_tab[3] = just_return;
+
+ avg_no_rnd_pixels_tab[0] = just_return;
+ avg_no_rnd_pixels_tab[1] = just_return;
+ avg_no_rnd_pixels_tab[2] = just_return;
+ avg_no_rnd_pixels_tab[3] = just_return;
+
+ //av_fdct = just_return;
+ //ff_idct = just_return;
+#endif
+}
diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h
new file mode 100644
index 0000000..87617e3
--- /dev/null
+++ b/libavcodec/x86/dsputil_mmx.h
@@ -0,0 +1,154 @@
+/*
+ * MMX optimized DSP utils
+ * Copyright (c) 2007 Aurelien Jacobs <aurel at gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DSPUTIL_MMX_H
+#define AVCODEC_X86_DSPUTIL_MMX_H
+
+#include <stdint.h>
+#include "libavcodec/dsputil.h"
+
+typedef struct { uint64_t a, b; } xmm_reg;
+
+extern const uint64_t ff_bone;
+extern const uint64_t ff_wtwo;
+
+extern const uint64_t ff_pdw_80000000[2];
+
+extern const uint64_t ff_pw_3;
+extern const uint64_t ff_pw_4;
+extern const xmm_reg ff_pw_5;
+extern const xmm_reg ff_pw_8;
+extern const uint64_t ff_pw_15;
+extern const xmm_reg ff_pw_16;
+extern const uint64_t ff_pw_20;
+extern const xmm_reg ff_pw_28;
+extern const xmm_reg ff_pw_32;
+extern const uint64_t ff_pw_42;
+extern const uint64_t ff_pw_64;
+extern const uint64_t ff_pw_96;
+extern const uint64_t ff_pw_128;
+extern const uint64_t ff_pw_255;
+
+extern const uint64_t ff_pb_1;
+extern const uint64_t ff_pb_3;
+extern const uint64_t ff_pb_7;
+extern const uint64_t ff_pb_1F;
+extern const uint64_t ff_pb_3F;
+extern const uint64_t ff_pb_81;
+extern const uint64_t ff_pb_A1;
+extern const uint64_t ff_pb_FC;
+
+extern const double ff_pd_1[2];
+extern const double ff_pd_2[2];
+
+#define LOAD4(stride,in,a,b,c,d)\
+ "movq 0*"#stride"+"#in", "#a"\n\t"\
+ "movq 1*"#stride"+"#in", "#b"\n\t"\
+ "movq 2*"#stride"+"#in", "#c"\n\t"\
+ "movq 3*"#stride"+"#in", "#d"\n\t"
+
+#define STORE4(stride,out,a,b,c,d)\
+ "movq "#a", 0*"#stride"+"#out"\n\t"\
+ "movq "#b", 1*"#stride"+"#out"\n\t"\
+ "movq "#c", 2*"#stride"+"#out"\n\t"\
+ "movq "#d", 3*"#stride"+"#out"\n\t"
+
+/* in/out: mma=mma+mmb, mmb=mmb-mma */
+#define SUMSUB_BA( a, b ) \
+ "paddw "#b", "#a" \n\t"\
+ "paddw "#b", "#b" \n\t"\
+ "psubw "#a", "#b" \n\t"
+
+#define SBUTTERFLY(a,b,t,n,m)\
+ "mov" #m " " #a ", " #t " \n\t" /* abcd */\
+ "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
+ "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
+
+#define TRANSPOSE4(a,b,c,d,t)\
+ SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
+ SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
+ SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
+ SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
+
+// e,f,g,h can be memory
+// out: a,d,t,c
+#define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\
+ "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\
+ "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\
+ "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 d3 g3 */\
+ "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\
+ SBUTTERFLY(a, b, t, bw, q) /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\
+ /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\
+ SBUTTERFLY(c, d, b, bw, q) /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\
+ /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\
+ SBUTTERFLY(a, c, d, wd, q) /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\
+ /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\
+ SBUTTERFLY(t, b, c, wd, q) /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\
+ /* c= a3 b3 c3 d3 e3 f3 g3 h3 */
+
+#ifdef ARCH_X86_64
+// permutes 01234567 -> 05736421
+#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
+ SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
+ SBUTTERFLY(c,d,b,wd,dqa)\
+ SBUTTERFLY(e,f,d,wd,dqa)\
+ SBUTTERFLY(g,h,f,wd,dqa)\
+ SBUTTERFLY(a,c,h,dq,dqa)\
+ SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
+ SBUTTERFLY(e,g,b,dq,dqa)\
+ SBUTTERFLY(d,f,g,dq,dqa)\
+ SBUTTERFLY(a,e,f,qdq,dqa)\
+ SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
+ SBUTTERFLY(h,b,d,qdq,dqa)\
+ SBUTTERFLY(c,g,b,qdq,dqa)\
+ "movdqa %%xmm8, "#g" \n\t"
+#else
+#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
+ "movdqa "#h", "#t" \n\t"\
+ SBUTTERFLY(a,b,h,wd,dqa)\
+ "movdqa "#h", 16"#t" \n\t"\
+ "movdqa "#t", "#h" \n\t"\
+ SBUTTERFLY(c,d,b,wd,dqa)\
+ SBUTTERFLY(e,f,d,wd,dqa)\
+ SBUTTERFLY(g,h,f,wd,dqa)\
+ SBUTTERFLY(a,c,h,dq,dqa)\
+ "movdqa "#h", "#t" \n\t"\
+ "movdqa 16"#t", "#h" \n\t"\
+ SBUTTERFLY(h,b,c,dq,dqa)\
+ SBUTTERFLY(e,g,b,dq,dqa)\
+ SBUTTERFLY(d,f,g,dq,dqa)\
+ SBUTTERFLY(a,e,f,qdq,dqa)\
+ SBUTTERFLY(h,d,e,qdq,dqa)\
+ "movdqa "#h", 16"#t" \n\t"\
+ "movdqa "#t", "#h" \n\t"\
+ SBUTTERFLY(h,b,d,qdq,dqa)\
+ SBUTTERFLY(c,g,b,qdq,dqa)\
+ "movdqa 16"#t", "#g" \n\t"
+#endif
+
+#define MOVQ_WONE(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+ "psrlw $15, %%" #regd ::)
+
+void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx);
+
+#endif /* AVCODEC_X86_DSPUTIL_MMX_H */
diff --git a/libavcodec/x86/dsputil_mmx_avg_template.c b/libavcodec/x86/dsputil_mmx_avg_template.c
new file mode 100644
index 0000000..616a12b
--- /dev/null
+++ b/libavcodec/x86/dsputil_mmx_avg_template.c
@@ -0,0 +1,896 @@
+/*
+ * DSP utils : average functions are compiled twice for 3dnow/mmx2
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
+ * Copyright (c) 2002-2004 Michael Niedermayer
+ *
+ * MMX optimization by Nick Kurshev <nickols_k at mail.ru>
+ * mostly rewritten by Michael Niedermayer <michaelni at gmx.at>
+ * and improved by Zdenek Kabelac <kabi at users.sf.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
+ clobber bug - now it will work with 2.95.2 and also with -fPIC
+ */
+static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ PAVGB" 1(%1), %%mm0 \n\t"
+ PAVGB" 1(%1, %3), %%mm1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ PAVGB" 1(%1), %%mm0 \n\t"
+ PAVGB" 1(%1, %3), %%mm1 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd (%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $4, %2 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ "movd %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movd (%1), %%mm1 \n\t"
+ "movd (%2), %%mm2 \n\t"
+ "movd 4(%2), %%mm3 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" %%mm2, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm1 \n\t"
+ "movd %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movd %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movd (%1), %%mm1 \n\t"
+ "movd 8(%2), %%mm2 \n\t"
+ "movd 12(%2), %%mm3 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" %%mm2, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm1 \n\t"
+ "movd %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movd %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $16, %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+}
+
+
+static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $8, %2 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" 8(%2), %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" 16(%2), %%mm0 \n\t"
+ PAVGB" 24(%2), %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $32, %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+//the following should be used, though better not with gcc ...
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+
+static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "pcmpeqb %%mm6, %%mm6 \n\t"
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $8, %2 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%2), %%mm2 \n\t"
+ "movq 8(%2), %%mm3 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "pxor %%mm6, %%mm2 \n\t"
+ "pxor %%mm6, %%mm3 \n\t"
+ PAVGB" %%mm2, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm1 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "movq 16(%2), %%mm2 \n\t"
+ "movq 24(%2), %%mm3 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "pxor %%mm6, %%mm2 \n\t"
+ "pxor %%mm6, %%mm3 \n\t"
+ PAVGB" %%mm2, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm1 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $32, %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+//the following should be used, though better not with gcc ...
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+
+static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd (%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $4, %2 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ "movd %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movd (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" 4(%2), %%mm1 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ "movd %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ PAVGB" (%3), %%mm1 \n\t"
+ "movd %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movd (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" 8(%2), %%mm0 \n\t"
+ PAVGB" 12(%2), %%mm1 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ "movd %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ PAVGB" (%3), %%mm1 \n\t"
+ "movd %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $16, %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+}
+
+
+static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $8, %2 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" 8(%2), %%mm1 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ PAVGB" (%3), %%mm1 \n\t"
+ "movq %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" 16(%2), %%mm0 \n\t"
+ PAVGB" 24(%2), %%mm1 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ PAVGB" (%3), %%mm1 \n\t"
+ "movq %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $32, %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+//the following should be used, though better not with gcc ...
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+
+static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 8(%1, %3), %%mm3 \n\t"
+ PAVGB" 1(%1), %%mm0 \n\t"
+ PAVGB" 1(%1, %3), %%mm1 \n\t"
+ PAVGB" 9(%1), %%mm2 \n\t"
+ PAVGB" 9(%1, %3), %%mm3 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "movq %%mm2, 8(%2) \n\t"
+ "movq %%mm3, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 8(%1, %3), %%mm3 \n\t"
+ PAVGB" 1(%1), %%mm0 \n\t"
+ PAVGB" 1(%1, %3), %%mm1 \n\t"
+ PAVGB" 9(%1), %%mm2 \n\t"
+ PAVGB" 9(%1, %3), %%mm3 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "movq %%mm2, 8(%2) \n\t"
+ "movq %%mm3, 8(%2, %3) \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" 8(%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $16, %2 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" 8(%2), %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" 16(%2), %%mm0 \n\t"
+ PAVGB" 24(%2), %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $32, %2 \n\t"
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+//the following should be used, though better not with gcc ...
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+
+static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" 8(%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $16, %2 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ PAVGB" 8(%3), %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" 8(%2), %%mm1 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ PAVGB" 8(%3), %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" 16(%2), %%mm0 \n\t"
+ PAVGB" 24(%2), %%mm1 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ PAVGB" 8(%3), %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $32, %2 \n\t"
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+//the following should be used, though better not with gcc ...
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+
+static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "pcmpeqb %%mm6, %%mm6 \n\t"
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "movq (%2), %%mm2 \n\t"
+ "movq 8(%2), %%mm3 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "pxor %%mm6, %%mm2 \n\t"
+ "pxor %%mm6, %%mm3 \n\t"
+ PAVGB" %%mm2, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm1 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $16, %2 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%2), %%mm2 \n\t"
+ "movq 8(%2), %%mm3 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "pxor %%mm6, %%mm2 \n\t"
+ "pxor %%mm6, %%mm3 \n\t"
+ PAVGB" %%mm2, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm1 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "movq 16(%2), %%mm2 \n\t"
+ "movq 24(%2), %%mm3 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "pxor %%mm6, %%mm2 \n\t"
+ "pxor %%mm6, %%mm3 \n\t"
+ PAVGB" %%mm2, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm1 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $32, %2 \n\t"
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+//the following should be used, though better not with gcc ...
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+
+/* GL: this function does incorrect rounding if overflow */
+static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BONE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "psubusb %%mm6, %%mm0 \n\t"
+ "psubusb %%mm6, %%mm2 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm2 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm2, (%2, %3) \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "psubusb %%mm6, %%mm0 \n\t"
+ "psubusb %%mm6, %%mm2 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm2 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm2, (%2, %3) \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "sub %3, %2 \n\t"
+ "1: \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm2, %%mm1 \n\t"
+ "movq %%mm0, (%2, %3) \n\t"
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ PAVGB" %%mm1, %%mm2 \n\t"
+ PAVGB" %%mm0, %%mm1 \n\t"
+ "movq %%mm2, (%2, %3) \n\t"
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D" (block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+/* GL: this function does incorrect rounding if overflow */
+static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BONE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "sub %3, %2 \n\t"
+ "1: \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "psubusb %%mm6, %%mm1 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm2, %%mm1 \n\t"
+ "movq %%mm0, (%2, %3) \n\t"
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "psubusb %%mm6, %%mm1 \n\t"
+ PAVGB" %%mm1, %%mm2 \n\t"
+ PAVGB" %%mm0, %%mm1 \n\t"
+ "movq %%mm2, (%2, %3) \n\t"
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D" (block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%2), %%mm0 \n\t"
+ "movq (%2, %3), %%mm1 \n\t"
+ PAVGB" (%1), %%mm0 \n\t"
+ PAVGB" (%1, %3), %%mm1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%2), %%mm0 \n\t"
+ "movq (%2, %3), %%mm1 \n\t"
+ PAVGB" (%1), %%mm0 \n\t"
+ PAVGB" (%1, %3), %%mm1 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ PAVGB" 1(%1), %%mm0 \n\t"
+ PAVGB" 1(%1, %3), %%mm2 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" (%2, %3), %%mm2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm2, (%2, %3) \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ PAVGB" 1(%1), %%mm0 \n\t"
+ PAVGB" 1(%1, %3), %%mm2 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" (%2, %3), %%mm2 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm2, (%2, %3) \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "sub %3, %2 \n\t"
+ "1: \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm2, %%mm1 \n\t"
+ "movq (%2, %3), %%mm3 \n\t"
+ "movq (%2, %%"REG_a"), %%mm4 \n\t"
+ PAVGB" %%mm3, %%mm0 \n\t"
+ PAVGB" %%mm4, %%mm1 \n\t"
+ "movq %%mm0, (%2, %3) \n\t"
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ PAVGB" %%mm1, %%mm2 \n\t"
+ PAVGB" %%mm0, %%mm1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "movq (%2, %3), %%mm3 \n\t"
+ "movq (%2, %%"REG_a"), %%mm4 \n\t"
+ PAVGB" %%mm3, %%mm2 \n\t"
+ PAVGB" %%mm4, %%mm1 \n\t"
+ "movq %%mm2, (%2, %3) \n\t"
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+/* Note this is not correctly rounded, but this function is only
+ * used for B-frames so it does not matter. */
+static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BONE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ PAVGB" 1(%1), %%mm0 \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm2 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "psubusb %%mm6, %%mm2 \n\t"
+ PAVGB" 1(%1, %3), %%mm1 \n\t"
+ PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm2, %%mm1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" (%2, %3), %%mm1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ PAVGB" 1(%1, %3), %%mm1 \n\t"
+ PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ PAVGB" %%mm1, %%mm2 \n\t"
+ PAVGB" %%mm0, %%mm1 \n\t"
+ PAVGB" (%2), %%mm2 \n\t"
+ PAVGB" (%2, %3), %%mm1 \n\t"
+ "movq %%mm2, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ do {
+ __asm__ volatile(
+ "movd (%1), %%mm0 \n\t"
+ "movd (%1, %2), %%mm1 \n\t"
+ "movd (%1, %2, 2), %%mm2 \n\t"
+ "movd (%1, %3), %%mm3 \n\t"
+ PAVGB" (%0), %%mm0 \n\t"
+ PAVGB" (%0, %2), %%mm1 \n\t"
+ PAVGB" (%0, %2, 2), %%mm2 \n\t"
+ PAVGB" (%0, %3), %%mm3 \n\t"
+ "movd %%mm0, (%1) \n\t"
+ "movd %%mm1, (%1, %2) \n\t"
+ "movd %%mm2, (%1, %2, 2) \n\t"
+ "movd %%mm3, (%1, %3) \n\t"
+ ::"S"(pixels), "D"(block),
+ "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size)
+ :"memory");
+ block += 4*line_size;
+ pixels += 4*line_size;
+ h -= 4;
+ } while(h > 0);
+}
+
+//FIXME the following could be optimized too ...
+static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
+ DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(put_pixels8_y2)(block , pixels , line_size, h);
+ DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
+ DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(avg_pixels8)(block , pixels , line_size, h);
+ DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(avg_pixels8_x2)(block , pixels , line_size, h);
+ DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(avg_pixels8_y2)(block , pixels , line_size, h);
+ DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
+ DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
+}
+
+#define QPEL_2TAP_L3(OPNAME) \
+static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movq (%1,%2), %%mm0 \n\t"\
+ "movq 8(%1,%2), %%mm1 \n\t"\
+ PAVGB" (%1,%3), %%mm0 \n\t"\
+ PAVGB" 8(%1,%3), %%mm1 \n\t"\
+ PAVGB" (%1), %%mm0 \n\t"\
+ PAVGB" 8(%1), %%mm1 \n\t"\
+ STORE_OP( (%1,%4),%%mm0)\
+ STORE_OP(8(%1,%4),%%mm1)\
+ "movq %%mm0, (%1,%4) \n\t"\
+ "movq %%mm1, 8(%1,%4) \n\t"\
+ "add %5, %1 \n\t"\
+ "decl %0 \n\t"\
+ "jnz 1b \n\t"\
+ :"+g"(h), "+r"(src)\
+ :"r"((x86_reg)off1), "r"((x86_reg)off2),\
+ "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
+ :"memory"\
+ );\
+}\
+static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movq (%1,%2), %%mm0 \n\t"\
+ PAVGB" (%1,%3), %%mm0 \n\t"\
+ PAVGB" (%1), %%mm0 \n\t"\
+ STORE_OP((%1,%4),%%mm0)\
+ "movq %%mm0, (%1,%4) \n\t"\
+ "add %5, %1 \n\t"\
+ "decl %0 \n\t"\
+ "jnz 1b \n\t"\
+ :"+g"(h), "+r"(src)\
+ :"r"((x86_reg)off1), "r"((x86_reg)off2),\
+ "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
+ :"memory"\
+ );\
+}
+
+#define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
+QPEL_2TAP_L3(avg_)
+#undef STORE_OP
+#define STORE_OP(a,b)
+QPEL_2TAP_L3(put_)
+#undef STORE_OP
+#undef QPEL_2TAP_L3
diff --git a/libavcodec/x86/dsputil_mmx_qns_template.c b/libavcodec/x86/dsputil_mmx_qns_template.c
new file mode 100644
index 0000000..d2dbfc5
--- /dev/null
+++ b/libavcodec/x86/dsputil_mmx_qns_template.c
@@ -0,0 +1,101 @@
+/*
+ * DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3
+ * Copyright (c) 2004 Michael Niedermayer
+ *
+ * MMX optimization by Michael Niedermayer <michaelni at gmx.at>
+ * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng at gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
+
+static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
+{
+ x86_reg i=0;
+
+ assert(FFABS(scale) < MAX_ABS);
+ scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
+
+ SET_RND(mm6);
+ __asm__ volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "movd %4, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
+ "paddw (%2, %0), %%mm0 \n\t"
+ "paddw 8(%2, %0), %%mm1 \n\t"
+ "psraw $6, %%mm0 \n\t"
+ "psraw $6, %%mm1 \n\t"
+ "pmullw (%3, %0), %%mm0 \n\t"
+ "pmullw 8(%3, %0), %%mm1 \n\t"
+ "pmaddwd %%mm0, %%mm0 \n\t"
+ "pmaddwd %%mm1, %%mm1 \n\t"
+ "paddd %%mm1, %%mm0 \n\t"
+ "psrld $4, %%mm0 \n\t"
+ "paddd %%mm0, %%mm7 \n\t"
+ "add $16, %0 \n\t"
+ "cmp $128, %0 \n\t" //FIXME optimize & bench
+ " jb 1b \n\t"
+ PHADDD(%%mm7, %%mm6)
+ "psrld $2, %%mm7 \n\t"
+ "movd %%mm7, %0 \n\t"
+
+ : "+r" (i)
+ : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
+ );
+ return i;
+}
+
+static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
+{
+ x86_reg i=0;
+
+ if(FFABS(scale) < MAX_ABS){
+ scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
+ SET_RND(mm6);
+ __asm__ volatile(
+ "movd %3, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
+ "paddw (%2, %0), %%mm0 \n\t"
+ "paddw 8(%2, %0), %%mm1 \n\t"
+ "movq %%mm0, (%2, %0) \n\t"
+ "movq %%mm1, 8(%2, %0) \n\t"
+ "add $16, %0 \n\t"
+ "cmp $128, %0 \n\t" // FIXME optimize & bench
+ " jb 1b \n\t"
+
+ : "+r" (i)
+ : "r"(basis), "r"(rem), "g"(scale)
+ );
+ }else{
+ for(i=0; i<8*8; i++){
+ rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
+ }
+ }
+}
diff --git a/libavcodec/x86/dsputil_mmx_rnd_template.c b/libavcodec/x86/dsputil_mmx_rnd_template.c
new file mode 100644
index 0000000..45ed590
--- /dev/null
+++ b/libavcodec/x86/dsputil_mmx_rnd_template.c
@@ -0,0 +1,590 @@
+/*
+ * DSP utils mmx functions are compiled twice for rnd/no_rnd
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
+ * Copyright (c) 2003-2004 Michael Niedermayer <michaelni at gmx.at>
+ *
+ * MMX optimization by Nick Kurshev <nickols_k at mail.ru>
+ * mostly rewritten by Michael Niedermayer <michaelni at gmx.at>
+ * and improved by Zdenek Kabelac <kabi at users.sf.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+// put_pixels
+static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $8, %2 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
+ "movq %%mm4, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm2 \n\t"
+ "movq 8(%2), %%mm3 \n\t"
+ "add %4, %1 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq %%mm5, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 16(%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm2 \n\t"
+ "movq 24(%2), %%mm3 \n\t"
+ "add %4, %1 \n\t"
+ "add $32, %2 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq %%mm5, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+}
+
+static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "movq 8(%1), %%mm0 \n\t"
+ "movq 9(%1), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm2 \n\t"
+ "movq 9(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "movq 8(%1), %%mm0 \n\t"
+ "movq 9(%1), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm2 \n\t"
+ "movq 9(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 8(%2), %%mm3 \n\t"
+ "add %4, %1 \n\t"
+ "add $16, %2 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%3) \n\t"
+ "movq %%mm5, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 8(%2), %%mm3 \n\t"
+ "add %4, %1 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%3) \n\t"
+ "movq %%mm5, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 16(%2), %%mm1 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 24(%2), %%mm3 \n\t"
+ "add %4, %1 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%3) \n\t"
+ "movq %%mm5, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $32, %2 \n\t"
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+}
+
+static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"),%%mm2 \n\t"
+ PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"),%%mm0 \n\t"
+ PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_ZERO(mm7);
+ SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm4 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ "add %3, %1 \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddusw %%mm2, %%mm0 \n\t"
+ "paddusw %%mm3, %%mm1 \n\t"
+ "paddusw %%mm6, %%mm4 \n\t"
+ "paddusw %%mm6, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "psrlw $2, %%mm4 \n\t"
+ "psrlw $2, %%mm5 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "movq %%mm4, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
+ "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm2, %%mm4 \n\t"
+ "paddusw %%mm3, %%mm5 \n\t"
+ "paddusw %%mm6, %%mm0 \n\t"
+ "paddusw %%mm6, %%mm1 \n\t"
+ "paddusw %%mm4, %%mm0 \n\t"
+ "paddusw %%mm5, %%mm1 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "movq %%mm0, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels)
+ :"D"(block), "r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+// avg_pixels
+static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "movd %1, %%mm1 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ "movd %%mm2, %0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ }
+ while (--h);
+}
+
+// in case more speed is needed - unrolling would certainly help
+static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %0, %%mm0 \n\t"
+ "movq %1, %%mm1 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ "movq %%mm2, %0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ }
+ while (--h);
+}
+
+static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %0, %%mm0 \n\t"
+ "movq %1, %%mm1 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ "movq %%mm2, %0 \n\t"
+ "movq 8%0, %%mm0 \n\t"
+ "movq 8%1, %%mm1 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ "movq %%mm2, 8%0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ }
+ while (--h);
+}
+
+static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %1, %%mm0 \n\t"
+ "movq 1%1, %%mm1 \n\t"
+ "movq %0, %%mm3 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, %0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ } while (--h);
+}
+
+static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %1, %%mm0 \n\t"
+ "movq %2, %%mm1 \n\t"
+ "movq %0, %%mm3 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, %0 \n\t"
+ :"+m"(*dst)
+ :"m"(*src1), "m"(*src2)
+ :"memory");
+ dst += dstStride;
+ src1 += src1Stride;
+ src2 += 8;
+ } while (--h);
+}
+
+static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %1, %%mm0 \n\t"
+ "movq 1%1, %%mm1 \n\t"
+ "movq %0, %%mm3 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, %0 \n\t"
+ "movq 8%1, %%mm0 \n\t"
+ "movq 9%1, %%mm1 \n\t"
+ "movq 8%0, %%mm3 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, 8%0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ } while (--h);
+}
+
+static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %1, %%mm0 \n\t"
+ "movq %2, %%mm1 \n\t"
+ "movq %0, %%mm3 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, %0 \n\t"
+ "movq 8%1, %%mm0 \n\t"
+ "movq 8%2, %%mm1 \n\t"
+ "movq 8%0, %%mm3 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, 8%0 \n\t"
+ :"+m"(*dst)
+ :"m"(*src1), "m"(*src2)
+ :"memory");
+ dst += dstStride;
+ src1 += src1Stride;
+ src2 += 16;
+ } while (--h);
+}
+
+static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm2 \n\t"
+ PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
+ "movq (%2), %%mm3 \n\t"
+ PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
+ "movq (%2, %3), %%mm3 \n\t"
+ PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
+ "movq (%2), %%mm3 \n\t"
+ PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
+ "movq (%2, %3), %%mm3 \n\t"
+ PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
+ "movq %%mm2, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+// this routine is 'slightly' suboptimal but mostly unused
+static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_ZERO(mm7);
+ SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm4 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ "add %3, %1 \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddusw %%mm2, %%mm0 \n\t"
+ "paddusw %%mm3, %%mm1 \n\t"
+ "paddusw %%mm6, %%mm4 \n\t"
+ "paddusw %%mm6, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "psrlw $2, %%mm4 \n\t"
+ "psrlw $2, %%mm5 \n\t"
+ "movq (%2, %%"REG_a"), %%mm3 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "pcmpeqd %%mm2, %%mm2 \n\t"
+ "paddb %%mm2, %%mm2 \n\t"
+ PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
+ "movq %%mm5, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
+ "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm2, %%mm4 \n\t"
+ "paddusw %%mm3, %%mm5 \n\t"
+ "paddusw %%mm6, %%mm0 \n\t"
+ "paddusw %%mm6, %%mm1 \n\t"
+ "paddusw %%mm4, %%mm0 \n\t"
+ "paddusw %%mm5, %%mm1 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm1 \n\t"
+ "movq (%2, %%"REG_a"), %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "pcmpeqd %%mm2, %%mm2 \n\t"
+ "paddb %%mm2, %%mm2 \n\t"
+ PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels)
+ :"D"(block), "r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+//FIXME optimize
+static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(put, pixels8_y2)(block , pixels , line_size, h);
+ DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+
+static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(put, pixels8_xy2)(block , pixels , line_size, h);
+ DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
+}
+
+static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(avg, pixels8_y2)(block , pixels , line_size, h);
+ DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+
+static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
+ DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
+}
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
new file mode 100644
index 0000000..91165f2
--- /dev/null
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -0,0 +1,92 @@
+;******************************************************************************
+;* MMX optimized DSP utils
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+
+section .text align=16
+
+%macro PSWAPD_SSE 2
+ pshufw %1, %2, 0x4e
+%endmacro
+%macro PSWAPD_3DN1 2
+ movq %1, %2
+ psrlq %1, 32
+ punpckldq %1, %2
+%endmacro
+
+%macro FLOAT_TO_INT16_INTERLEAVE6 1
+; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
+cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
+%ifdef ARCH_X86_64
+ %define lend r10d
+ mov lend, r2d
+%else
+ %define lend dword r2m
+%endif
+ mov src1q, [srcq+1*gprsize]
+ mov src2q, [srcq+2*gprsize]
+ mov src3q, [srcq+3*gprsize]
+ mov src4q, [srcq+4*gprsize]
+ mov src5q, [srcq+5*gprsize]
+ mov srcq, [srcq]
+ sub src1q, srcq
+ sub src2q, srcq
+ sub src3q, srcq
+ sub src4q, srcq
+ sub src5q, srcq
+.loop:
+ cvtps2pi mm0, [srcq]
+ cvtps2pi mm1, [srcq+src1q]
+ cvtps2pi mm2, [srcq+src2q]
+ cvtps2pi mm3, [srcq+src3q]
+ cvtps2pi mm4, [srcq+src4q]
+ cvtps2pi mm5, [srcq+src5q]
+ packssdw mm0, mm3
+ packssdw mm1, mm4
+ packssdw mm2, mm5
+ pswapd mm3, mm0
+ punpcklwd mm0, mm1
+ punpckhwd mm1, mm2
+ punpcklwd mm2, mm3
+ pswapd mm3, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm2, mm1
+ punpckldq mm1, mm3
+ movq [dstq ], mm0
+ movq [dstq+16], mm2
+ movq [dstq+ 8], mm1
+ add srcq, 8
+ add dstq, 24
+ sub lend, 2
+ jg .loop
+ emms
+ RET
+%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
+
+%define pswapd PSWAPD_SSE
+FLOAT_TO_INT16_INTERLEAVE6 sse
+%define cvtps2pi pf2id
+%define pswapd PSWAPD_3DN1
+FLOAT_TO_INT16_INTERLEAVE6 3dnow
+%undef pswapd
+FLOAT_TO_INT16_INTERLEAVE6 3dn2
+%undef cvtps2pi
+
diff --git a/libavcodec/i386/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
similarity index 100%
rename from libavcodec/i386/dsputilenc_mmx.c
rename to libavcodec/x86/dsputilenc_mmx.c
diff --git a/libavcodec/i386/fdct_mmx.c b/libavcodec/x86/fdct_mmx.c
similarity index 100%
rename from libavcodec/i386/fdct_mmx.c
rename to libavcodec/x86/fdct_mmx.c
diff --git a/libavcodec/i386/fft_3dn.c b/libavcodec/x86/fft_3dn.c
similarity index 100%
rename from libavcodec/i386/fft_3dn.c
rename to libavcodec/x86/fft_3dn.c
diff --git a/libavcodec/i386/fft_3dn2.c b/libavcodec/x86/fft_3dn2.c
similarity index 100%
rename from libavcodec/i386/fft_3dn2.c
rename to libavcodec/x86/fft_3dn2.c
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
new file mode 100644
index 0000000..3971867
--- /dev/null
+++ b/libavcodec/x86/fft_mmx.asm
@@ -0,0 +1,467 @@
+;******************************************************************************
+;* FFT transform with SSE/3DNow optimizations
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+; These functions are not individually interchangeable with the C versions.
+; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
+; in blocks as convenient to the vector size.
+; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+
+%define M_SQRT1_2 0.70710678118654752440
+ps_root2: times 4 dd M_SQRT1_2
+ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
+ps_m1p1: dd 1<<31, 0
+
+%assign i 16
+%rep 13
+cextern ff_cos_ %+ i
+%assign i i<<1
+%endrep
+
+%ifdef ARCH_X86_64
+ %define pointer dq
+%else
+ %define pointer dd
+%endif
+
+%macro IF0 1+
+%endmacro
+%macro IF1 1+
+ %1
+%endmacro
+
+section .text align=16
+
+%macro T2_3DN 4 ; z0, z1, mem0, mem1
+ mova %1, %3
+ mova %2, %1
+ pfadd %1, %4
+ pfsub %2, %4
+%endmacro
+
+%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
+ mova %5, %3
+ pfsub %3, %4
+ pfadd %5, %4 ; {t6,t5}
+ pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7}
+ mova %6, %1
+ pswapd %3, %3
+ pfadd %1, %5 ; {r0,i0}
+ pfsub %6, %5 ; {r2,i2}
+ mova %4, %2
+ pfadd %2, %3 ; {r1,i1}
+ pfsub %4, %3 ; {r3,i3}
+ SWAP %3, %6
+%endmacro
+
+; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
+; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
+%macro T4_SSE 3
+ mova %3, %1
+ shufps %1, %2, 0x64 ; {r0,i0,r3,i2}
+ shufps %3, %2, 0xce ; {r1,i1,r2,i3}
+ mova %2, %1
+ addps %1, %3 ; {t1,t2,t6,t5}
+ subps %2, %3 ; {t3,t4,t8,t7}
+ mova %3, %1
+ shufps %1, %2, 0x44 ; {t1,t2,t3,t4}
+ shufps %3, %2, 0xbe ; {t6,t5,t7,t8}
+ mova %2, %1
+ addps %1, %3 ; {r0,i0,r1,i1}
+ subps %2, %3 ; {r2,i2,r3,i3}
+ mova %3, %1
+ shufps %1, %2, 0x88 ; {r0,r1,r2,r3}
+ shufps %3, %2, 0xdd ; {i0,i1,i2,i3}
+ SWAP %2, %3
+%endmacro
+
+%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
+ mova %5, %3
+ shufps %3, %4, 0x44 ; {r4,i4,r6,i6}
+ shufps %5, %4, 0xee ; {r5,i5,r7,i7}
+ mova %6, %3
+ subps %3, %5 ; {r5,i5,r7,i7}
+ addps %6, %5 ; {t1,t2,t3,t4}
+ mova %5, %3
+ shufps %5, %5, 0xb1 ; {i5,r5,i7,r7}
+ mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7}
+ mulps %5, [ps_root2 GLOBAL]
+ addps %3, %5 ; {t8,t7,ta,t9}
+ mova %5, %6
+ shufps %6, %3, 0x36 ; {t3,t2,t9,t8}
+ shufps %5, %3, 0x9c ; {t1,t4,t7,ta}
+ mova %3, %6
+ addps %6, %5 ; {t1,t2,t9,ta}
+ subps %3, %5 ; {t6,t5,tc,tb}
+ mova %5, %6
+ shufps %6, %3, 0xd8 ; {t1,t9,t5,tb}
+ shufps %5, %3, 0x8d ; {t2,ta,t6,tc}
+ mova %3, %1
+ mova %4, %2
+ addps %1, %6 ; {r0,r1,r2,r3}
+ addps %2, %5 ; {i0,i1,i2,i3}
+ subps %3, %6 ; {r4,r5,r6,r7}
+ subps %4, %5 ; {i4,i5,i6,i7}
+%endmacro
+
+; scheduled for cpu-bound sizes
+%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
+IF%1 mova m4, Z(4)
+IF%1 mova m5, Z(5)
+ mova m0, %2 ; wre
+ mova m2, m4
+ mova m1, %3 ; wim
+ mova m3, m5
+ mulps m2, m0 ; r2*wre
+IF%1 mova m6, Z(6)
+ mulps m3, m1 ; i2*wim
+IF%1 mova m7, Z(7)
+ mulps m4, m1 ; r2*wim
+ mulps m5, m0 ; i2*wre
+ addps m2, m3 ; r2*wre + i2*wim
+ mova m3, m1
+ mulps m1, m6 ; r3*wim
+ subps m5, m4 ; i2*wre - r2*wim
+ mova m4, m0
+ mulps m3, m7 ; i3*wim
+ mulps m4, m6 ; r3*wre
+ mulps m0, m7 ; i3*wre
+ subps m4, m3 ; r3*wre - i3*wim
+ mova m3, Z(0)
+ addps m0, m1 ; i3*wre + r3*wim
+ mova m1, m4
+ addps m4, m2 ; t5
+ subps m1, m2 ; t3
+ subps m3, m4 ; r2
+ addps m4, Z(0) ; r0
+ mova m6, Z(2)
+ mova Z(4), m3
+ mova Z(0), m4
+ mova m3, m5
+ subps m5, m0 ; t4
+ mova m4, m6
+ subps m6, m5 ; r3
+ addps m5, m4 ; r1
+ mova Z(6), m6
+ mova Z(2), m5
+ mova m2, Z(3)
+ addps m3, m0 ; t6
+ subps m2, m1 ; i3
+ mova m7, Z(1)
+ addps m1, Z(3) ; i1
+ mova Z(7), m2
+ mova Z(3), m1
+ mova m4, m7
+ subps m7, m3 ; i2
+ addps m3, m4 ; i0
+ mova Z(5), m7
+ mova Z(1), m3
+%endmacro
+
+; scheduled to avoid store->load aliasing
+%macro PASS_BIG 1 ; (!interleave)
+ mova m4, Z(4) ; r2
+ mova m5, Z(5) ; i2
+ mova m2, m4
+ mova m0, [wq] ; wre
+ mova m3, m5
+ mova m1, [wq+o1q] ; wim
+ mulps m2, m0 ; r2*wre
+ mova m6, Z(6) ; r3
+ mulps m3, m1 ; i2*wim
+ mova m7, Z(7) ; i3
+ mulps m4, m1 ; r2*wim
+ mulps m5, m0 ; i2*wre
+ addps m2, m3 ; r2*wre + i2*wim
+ mova m3, m1
+ mulps m1, m6 ; r3*wim
+ subps m5, m4 ; i2*wre - r2*wim
+ mova m4, m0
+ mulps m3, m7 ; i3*wim
+ mulps m4, m6 ; r3*wre
+ mulps m0, m7 ; i3*wre
+ subps m4, m3 ; r3*wre - i3*wim
+ mova m3, Z(0)
+ addps m0, m1 ; i3*wre + r3*wim
+ mova m1, m4
+ addps m4, m2 ; t5
+ subps m1, m2 ; t3
+ subps m3, m4 ; r2
+ addps m4, Z(0) ; r0
+ mova m6, Z(2)
+ mova Z(4), m3
+ mova Z(0), m4
+ mova m3, m5
+ subps m5, m0 ; t4
+ mova m4, m6
+ subps m6, m5 ; r3
+ addps m5, m4 ; r1
+IF%1 mova Z(6), m6
+IF%1 mova Z(2), m5
+ mova m2, Z(3)
+ addps m3, m0 ; t6
+ subps m2, m1 ; i3
+ mova m7, Z(1)
+ addps m1, Z(3) ; i1
+IF%1 mova Z(7), m2
+IF%1 mova Z(3), m1
+ mova m4, m7
+ subps m7, m3 ; i2
+ addps m3, m4 ; i0
+IF%1 mova Z(5), m7
+IF%1 mova Z(1), m3
+%if %1==0
+ mova m4, m5 ; r1
+ mova m0, m6 ; r3
+ unpcklps m5, m1
+ unpckhps m4, m1
+ unpcklps m6, m2
+ unpckhps m0, m2
+ mova m1, Z(0)
+ mova m2, Z(4)
+ mova Z(2), m5
+ mova Z(3), m4
+ mova Z(6), m6
+ mova Z(7), m0
+ mova m5, m1 ; r0
+ mova m4, m2 ; r2
+ unpcklps m1, m3
+ unpckhps m5, m3
+ unpcklps m2, m7
+ unpckhps m4, m7
+ mova Z(0), m1
+ mova Z(1), m5
+ mova Z(4), m2
+ mova Z(5), m4
+%endif
+%endmacro
+
+%macro PUNPCK 3
+ mova %3, %1
+ punpckldq %1, %2
+ punpckhdq %3, %2
+%endmacro
+
+INIT_XMM
+
+%define Z(x) [r0+mmsize*x]
+
+align 16
+fft4_sse:
+ mova m0, Z(0)
+ mova m1, Z(1)
+ T4_SSE m0, m1, m2
+ mova Z(0), m0
+ mova Z(1), m1
+ ret
+
+align 16
+fft8_sse:
+ mova m0, Z(0)
+ mova m1, Z(1)
+ T4_SSE m0, m1, m2
+ mova m2, Z(2)
+ mova m3, Z(3)
+ T8_SSE m0, m1, m2, m3, m4, m5
+ mova Z(0), m0
+ mova Z(1), m1
+ mova Z(2), m2
+ mova Z(3), m3
+ ret
+
+align 16
+fft16_sse:
+ mova m0, Z(0)
+ mova m1, Z(1)
+ T4_SSE m0, m1, m2
+ mova m2, Z(2)
+ mova m3, Z(3)
+ T8_SSE m0, m1, m2, m3, m4, m5
+ mova m4, Z(4)
+ mova m5, Z(5)
+ mova Z(0), m0
+ mova Z(1), m1
+ mova Z(2), m2
+ mova Z(3), m3
+ T4_SSE m4, m5, m6
+ mova m6, Z(6)
+ mova m7, Z(7)
+ T4_SSE m6, m7, m0
+ PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL]
+ ret
+
+
+INIT_MMX
+
+%macro FFT48_3DN 1
+align 16
+fft4%1:
+ T2_3DN m0, m1, Z(0), Z(1)
+ mova m2, Z(2)
+ mova m3, Z(3)
+ T4_3DN m0, m1, m2, m3, m4, m5
+ PUNPCK m0, m1, m4
+ PUNPCK m2, m3, m5
+ mova Z(0), m0
+ mova Z(1), m4
+ mova Z(2), m2
+ mova Z(3), m5
+ ret
+
+align 16
+fft8%1:
+ T2_3DN m0, m1, Z(0), Z(1)
+ mova m2, Z(2)
+ mova m3, Z(3)
+ T4_3DN m0, m1, m2, m3, m4, m5
+ mova Z(0), m0
+ mova Z(2), m2
+ T2_3DN m4, m5, Z(4), Z(5)
+ T2_3DN m6, m7, Z(6), Z(7)
+ pswapd m0, m5
+ pswapd m2, m7
+ pxor m0, [ps_m1p1 GLOBAL]
+ pxor m2, [ps_m1p1 GLOBAL]
+ pfsub m5, m0
+ pfadd m7, m2
+ pfmul m5, [ps_root2 GLOBAL]
+ pfmul m7, [ps_root2 GLOBAL]
+ T4_3DN m1, m3, m5, m7, m0, m2
+ mova Z(5), m5
+ mova Z(7), m7
+ mova m0, Z(0)
+ mova m2, Z(2)
+ T4_3DN m0, m2, m4, m6, m5, m7
+ PUNPCK m0, m1, m5
+ PUNPCK m2, m3, m7
+ mova Z(0), m0
+ mova Z(1), m5
+ mova Z(2), m2
+ mova Z(3), m7
+ PUNPCK m4, Z(5), m5
+ PUNPCK m6, Z(7), m7
+ mova Z(4), m4
+ mova Z(5), m5
+ mova Z(6), m6
+ mova Z(7), m7
+ ret
+%endmacro
+
+FFT48_3DN _3dn2
+
+%macro pswapd 2
+%ifidn %1, %2
+ movd [r0+12], %1
+ punpckhdq %1, [r0+8]
+%else
+ movq %1, %2
+ psrlq %1, 32
+ punpckldq %1, %2
+%endif
+%endmacro
+
+FFT48_3DN _3dn
+
+
+%define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)]
+
+%macro DECL_PASS 2+ ; name, payload
+align 16
+%1:
+DEFINE_ARGS z, w, n, o1, o3
+ lea o3q, [nq*3]
+ lea o1q, [nq*8]
+ shl o3q, 4
+.loop:
+ %2
+ add zq, mmsize*2
+ add wq, mmsize
+ sub nd, mmsize/8
+ jg .loop
+ rep ret
+%endmacro
+
+INIT_XMM
+DECL_PASS pass_sse, PASS_BIG 1
+DECL_PASS pass_interleave_sse, PASS_BIG 0
+
+INIT_MMX
+%define mulps pfmul
+%define addps pfadd
+%define subps pfsub
+%define unpcklps punpckldq
+%define unpckhps punpckhdq
+DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
+DECL_PASS pass_interleave_3dn, PASS_BIG 0
+%define pass_3dn2 pass_3dn
+%define pass_interleave_3dn2 pass_interleave_3dn
+
+
+%macro DECL_FFT 2-3 ; nbits, cpu, suffix
+%xdefine list_of_fft fft4%2, fft8%2
+%if %1==5
+%xdefine list_of_fft list_of_fft, fft16%2
+%endif
+
+%assign n 1<<%1
+%rep 17-%1
+%assign n2 n/2
+%assign n4 n/4
+%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2
+
+align 16
+fft %+ n %+ %3%2:
+ call fft %+ n2 %+ %2
+ add r0, n*4 - (n&(-2<<%1))
+ call fft %+ n4 %+ %2
+ add r0, n*2 - (n2&(-2<<%1))
+ call fft %+ n4 %+ %2
+ sub r0, n*6 + (n2&(-2<<%1))
+ lea r1, [ff_cos_ %+ n GLOBAL]
+ mov r2d, n4/2
+ jmp pass%3%2
+
+%assign n n*2
+%endrep
+%undef n
+
+align 8
+dispatch_tab%3%2: pointer list_of_fft
+
+; On x86_32, this function does the register saving and restoring for all of fft.
+; The others pass args in registers and don't spill anything.
+cglobal fft_dispatch%3%2, 2,5,0, z, nbits
+ lea r2, [dispatch_tab%3%2 GLOBAL]
+ mov r2, [r2 + (nbitsq-2)*gprsize]
+ call r2
+ RET
+%endmacro ; DECL_FFT
+
+DECL_FFT 5, _sse
+DECL_FFT 5, _sse, _interleave
+DECL_FFT 4, _3dn
+DECL_FFT 4, _3dn, _interleave
+DECL_FFT 4, _3dn2
+DECL_FFT 4, _3dn2, _interleave
+
diff --git a/libavcodec/i386/fft_sse.c b/libavcodec/x86/fft_sse.c
similarity index 100%
rename from libavcodec/i386/fft_sse.c
rename to libavcodec/x86/fft_sse.c
diff --git a/libavcodec/i386/flacdsp_mmx.c b/libavcodec/x86/flacdsp_mmx.c
similarity index 100%
rename from libavcodec/i386/flacdsp_mmx.c
rename to libavcodec/x86/flacdsp_mmx.c
diff --git a/libavcodec/x86/h264_deblock_sse2.asm b/libavcodec/x86/h264_deblock_sse2.asm
new file mode 100644
index 0000000..d59de91
--- /dev/null
+++ b/libavcodec/x86/h264_deblock_sse2.asm
@@ -0,0 +1,747 @@
+;*****************************************************************************
+;* deblock-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2005-2008 x264 project
+;*
+;* Authors: Loren Merritt <lorenm at u.washington.edu>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+pb_00: times 16 db 0x00
+pb_01: times 16 db 0x01
+pb_03: times 16 db 0x03
+pb_a1: times 16 db 0xa1
+
+SECTION .text
+
+; expands to [base],...,[base+7*stride]
+%define PASS8ROWS(base, base3, stride, stride3) \
+ [base], [base+stride], [base+stride*2], [base3], \
+ [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
+
+; in: 8 rows of 4 bytes in %1..%8
+; out: 4 rows of 8 bytes in m0..m3
+%macro TRANSPOSE4x8_LOAD 8
+ movd m0, %1
+ movd m2, %2
+ movd m1, %3
+ movd m3, %4
+ punpcklbw m0, m2
+ punpcklbw m1, m3
+ movq m2, m0
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+
+ movd m4, %5
+ movd m6, %6
+ movd m5, %7
+ movd m7, %8
+ punpcklbw m4, m6
+ punpcklbw m5, m7
+ movq m6, m4
+ punpcklwd m4, m5
+ punpckhwd m6, m5
+
+ movq m1, m0
+ movq m3, m2
+ punpckldq m0, m4
+ punpckhdq m1, m4
+ punpckldq m2, m6
+ punpckhdq m3, m6
+%endmacro
+
+; in: 4 rows of 8 bytes in m0..m3
+; out: 8 rows of 4 bytes in %1..%8
+%macro TRANSPOSE8x4_STORE 8
+ movq m4, m0
+ movq m5, m1
+ movq m6, m2
+ punpckhdq m4, m4
+ punpckhdq m5, m5
+ punpckhdq m6, m6
+
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ movq m1, m0
+ punpcklwd m0, m2
+ punpckhwd m1, m2
+ movd %1, m0
+ punpckhdq m0, m0
+ movd %2, m0
+ movd %3, m1
+ punpckhdq m1, m1
+ movd %4, m1
+
+ punpckhdq m3, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m3
+ movq m5, m4
+ punpcklwd m4, m6
+ punpckhwd m5, m6
+ movd %5, m4
+ punpckhdq m4, m4
+ movd %6, m4
+ movd %7, m5
+ punpckhdq m5, m5
+ movd %8, m5
+%endmacro
+
+%macro SBUTTERFLY 4
+ movq %4, %2
+ punpckl%1 %2, %3
+ punpckh%1 %4, %3
+%endmacro
+
+; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
+; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
+%macro TRANSPOSE6x8_MEM 9
+ movq m0, %1
+ movq m1, %2
+ movq m2, %3
+ movq m3, %4
+ movq m4, %5
+ movq m5, %6
+ movq m6, %7
+ SBUTTERFLY bw, m0, m1, m7
+ SBUTTERFLY bw, m2, m3, m1
+ SBUTTERFLY bw, m4, m5, m3
+ movq [%9+0x10], m1
+ SBUTTERFLY bw, m6, %8, m5
+ SBUTTERFLY wd, m0, m2, m1
+ SBUTTERFLY wd, m4, m6, m2
+ punpckhdq m0, m4
+ movq [%9+0x00], m0
+ SBUTTERFLY wd, m7, [%9+0x10], m6
+ SBUTTERFLY wd, m3, m5, m4
+ SBUTTERFLY dq, m7, m3, m0
+ SBUTTERFLY dq, m1, m2, m5
+ punpckldq m6, m4
+ movq [%9+0x10], m1
+ movq [%9+0x20], m5
+ movq [%9+0x30], m7
+ movq [%9+0x40], m0
+ movq [%9+0x50], m6
+%endmacro
+
+; in: 8 rows of 8 in %1..%8
+; out: 8 rows of 8 in %9..%16
+%macro TRANSPOSE8x8_MEM 16
+ movq m0, %1
+ movq m1, %2
+ movq m2, %3
+ movq m3, %4
+ movq m4, %5
+ movq m5, %6
+ movq m6, %7
+ SBUTTERFLY bw, m0, m1, m7
+ SBUTTERFLY bw, m2, m3, m1
+ SBUTTERFLY bw, m4, m5, m3
+ SBUTTERFLY bw, m6, %8, m5
+ movq %9, m3
+ SBUTTERFLY wd, m0, m2, m3
+ SBUTTERFLY wd, m4, m6, m2
+ SBUTTERFLY wd, m7, m1, m6
+ movq %11, m2
+ movq m2, %9
+ SBUTTERFLY wd, m2, m5, m1
+ SBUTTERFLY dq, m0, m4, m5
+ SBUTTERFLY dq, m7, m2, m4
+ movq %9, m0
+ movq %10, m5
+ movq %13, m7
+ movq %14, m4
+ SBUTTERFLY dq, m3, %11, m0
+ SBUTTERFLY dq, m6, m1, m5
+ movq %11, m3
+ movq %12, m0
+ movq %15, m6
+ movq %16, m5
+%endmacro
+
+; out: %4 = |%1-%2|>%3
+; clobbers: %5
+%macro DIFF_GT 5
+ mova %5, %2
+ mova %4, %1
+ psubusb %5, %1
+ psubusb %4, %2
+ por %4, %5
+ psubusb %4, %3
+%endmacro
+
+; out: %4 = |%1-%2|>%3
+; clobbers: %5
+%macro DIFF_GT2 5
+ mova %5, %2
+ mova %4, %1
+ psubusb %5, %1
+ psubusb %4, %2
+ psubusb %5, %3
+ psubusb %4, %3
+ pcmpeqb %4, %5
+%endmacro
+
+%macro SPLATW 1
+%ifidn m0, xmm0
+ pshuflw %1, %1, 0
+ punpcklqdq %1, %1
+%else
+ pshufw %1, %1, 0
+%endif
+%endmacro
+
+; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
+; out: m5=beta-1, m7=mask, %3=alpha-1
+; clobbers: m4,m6
+%macro LOAD_MASK 2-3
+ movd m4, %1
+ movd m5, %2
+ SPLATW m4
+ SPLATW m5
+ packuswb m4, m4 ; 16x alpha-1
+ packuswb m5, m5 ; 16x beta-1
+%if %0>2
+ mova %3, m4
+%endif
+ DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
+ DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
+ por m7, m4
+ DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
+ por m7, m4
+ pxor m6, m6
+ pcmpeqb m7, m6
+%endmacro
+
+; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
+; out: m1=p0' m2=q0'
+; clobbers: m0,3-6
+%macro DEBLOCK_P0_Q0 0
+ mova m5, m1
+ pxor m5, m2 ; p0^q0
+ pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
+ pcmpeqb m4, m4
+ pxor m3, m4
+ pavgb m3, m0 ; (p1 - q1 + 256)>>1
+ pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
+ pxor m4, m1
+ pavgb m4, m2 ; (q0 - p0 + 256)>>1
+ pavgb m3, m5
+ paddusb m3, m4 ; d+128+33
+ mova m6, [pb_a1 GLOBAL]
+ psubusb m6, m3
+ psubusb m3, [pb_a1 GLOBAL]
+ pminub m6, m7
+ pminub m3, m7
+ psubusb m1, m6
+ psubusb m2, m3
+ paddusb m1, m3
+ paddusb m2, m6
+%endmacro
+
+; in: m1=p0 m2=q0
+; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
+; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
+; clobbers: q2, tmp, tc0
+%macro LUMA_Q1 6
+ mova %6, m1
+ pavgb %6, m2
+ pavgb %2, %6 ; avg(p2,avg(p0,q0))
+ pxor %6, %3
+ pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
+ psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
+ mova %6, %1
+ psubusb %6, %5
+ paddusb %5, %1
+ pmaxub %2, %6
+ pminub %2, %5
+ mova %4, %2
+%endmacro
+
+%ifdef ARCH_X86_64
+;-----------------------------------------------------------------------------
+; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+INIT_XMM
+cglobal x264_deblock_v_luma_sse2
+ movd m8, [r4] ; tc0
+ lea r4, [r1*3]
+ dec r2d ; alpha-1
+ neg r4
+ dec r3d ; beta-1
+ add r4, r0 ; pix-3*stride
+
+ mova m0, [r4+r1] ; p1
+ mova m1, [r4+2*r1] ; p0
+ mova m2, [r0] ; q0
+ mova m3, [r0+r1] ; q1
+ LOAD_MASK r2d, r3d
+
+ punpcklbw m8, m8
+ punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
+ pcmpeqb m9, m9
+ pcmpeqb m9, m8
+ pandn m9, m7
+ pand m8, m9
+
+ movdqa m3, [r4] ; p2
+ DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
+ pand m6, m9
+ mova m7, m8
+ psubb m7, m6
+ pand m6, m8
+ LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
+
+ movdqa m4, [r0+2*r1] ; q2
+ DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
+ pand m6, m9
+ pand m8, m6
+ psubb m7, m6
+ mova m3, [r0+r1]
+ LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
+
+ DEBLOCK_P0_Q0
+ mova [r4+2*r1], m1
+ mova [r0], m2
+ ret
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+INIT_MMX
+cglobal x264_deblock_h_luma_sse2
+ movsxd r10, esi
+ lea r11, [r10+r10*2]
+ lea rax, [r0-4]
+ lea r9, [r0-4+r11]
+ sub rsp, 0x68
+ %define pix_tmp rsp
+
+ ; transpose 6x16 -> tmp space
+ TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
+ lea rax, [rax+r10*8]
+ lea r9, [r9 +r10*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
+
+ ; vertical filter
+ ; alpha, beta, tc0 are still in r2d, r3d, r4
+ ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
+ lea r0, [pix_tmp+0x30]
+ mov esi, 0x10
+ call x264_deblock_v_luma_sse2
+
+ ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
+ add rax, 2
+ add r9, 2
+ movq m0, [pix_tmp+0x18]
+ movq m1, [pix_tmp+0x28]
+ movq m2, [pix_tmp+0x38]
+ movq m3, [pix_tmp+0x48]
+ TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
+
+ shl r10, 3
+ sub rax, r10
+ sub r9, r10
+ shr r10, 3
+ movq m0, [pix_tmp+0x10]
+ movq m1, [pix_tmp+0x20]
+ movq m2, [pix_tmp+0x30]
+ movq m3, [pix_tmp+0x40]
+ TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
+
+ add rsp, 0x68
+ ret
+
+%else
+
+%macro DEBLOCK_LUMA 3
+;-----------------------------------------------------------------------------
+; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal x264_deblock_%2_luma_%1, 5,5
+ lea r4, [r1*3]
+ dec r2 ; alpha-1
+ neg r4
+ dec r3 ; beta-1
+ add r4, r0 ; pix-3*stride
+ %assign pad 2*%3+12-(stack_offset&15)
+ SUB esp, pad
+
+ mova m0, [r4+r1] ; p1
+ mova m1, [r4+2*r1] ; p0
+ mova m2, [r0] ; q0
+ mova m3, [r0+r1] ; q1
+ LOAD_MASK r2, r3
+
+ mov r3, r4m
+ movd m4, [r3] ; tc0
+ punpcklbw m4, m4
+ punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
+ mova [esp+%3], m4 ; tc
+ pcmpeqb m3, m3
+ pcmpgtb m4, m3
+ pand m4, m7
+ mova [esp], m4 ; mask
+
+ mova m3, [r4] ; p2
+ DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
+ pand m6, m4
+ pand m4, [esp+%3] ; tc
+ mova m7, m4
+ psubb m7, m6
+ pand m6, m4
+ LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
+
+ mova m4, [r0+2*r1] ; q2
+ DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
+ mova m5, [esp] ; mask
+ pand m6, m5
+ mova m5, [esp+%3] ; tc
+ pand m5, m6
+ psubb m7, m6
+ mova m3, [r0+r1]
+ LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
+
+ DEBLOCK_P0_Q0
+ mova [r4+2*r1], m1
+ mova [r0], m2
+ ADD esp, pad
+ RET
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+INIT_MMX
+cglobal x264_deblock_h_luma_%1, 0,5
+ mov r0, r0m
+ mov r3, r1m
+ lea r4, [r3*3]
+ sub r0, 4
+ lea r1, [r0+r4]
+ %assign pad 0x78-(stack_offset&15)
+ SUB esp, pad
+%define pix_tmp esp+12
+
+ ; transpose 6x16 -> tmp space
+ TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
+ lea r0, [r0+r3*8]
+ lea r1, [r1+r3*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
+
+ ; vertical filter
+ lea r0, [pix_tmp+0x30]
+ PUSH dword r4m
+ PUSH dword r3m
+ PUSH dword r2m
+ PUSH dword 16
+ PUSH dword r0
+ call x264_deblock_%2_luma_%1
+%ifidn %2, v8
+ add dword [esp ], 8 ; pix_tmp+0x38
+ add dword [esp+16], 2 ; tc0+2
+ call x264_deblock_%2_luma_%1
+%endif
+ ADD esp, 20
+
+ ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
+ mov r0, r0m
+ sub r0, 2
+ lea r1, [r0+r4]
+
+ movq m0, [pix_tmp+0x10]
+ movq m1, [pix_tmp+0x20]
+ movq m2, [pix_tmp+0x30]
+ movq m3, [pix_tmp+0x40]
+ TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
+
+ lea r0, [r0+r3*8]
+ lea r1, [r1+r3*8]
+ movq m0, [pix_tmp+0x18]
+ movq m1, [pix_tmp+0x28]
+ movq m2, [pix_tmp+0x38]
+ movq m3, [pix_tmp+0x48]
+ TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
+
+ ADD esp, pad
+ RET
+%endmacro ; DEBLOCK_LUMA
+
+INIT_XMM
+DEBLOCK_LUMA sse2, v, 16
+
+%endif ; ARCH
+
+
+
+%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
+ mova t0, p2
+ mova t1, p0
+ pavgb t0, p1
+ pavgb t1, q0
+ pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
+ mova t5, t1
+ mova t2, p2
+ mova t3, p0
+ paddb t2, p1
+ paddb t3, q0
+ paddb t2, t3
+ mova t3, t2
+ mova t4, t2
+ psrlw t2, 1
+ pavgb t2, mpb_00
+ pxor t2, t0
+ pand t2, mpb_01
+ psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
+
+ mova t1, p2
+ mova t2, p2
+ pavgb t1, q1
+ psubb t2, q1
+ paddb t3, t3
+ psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
+ pand t2, mpb_01
+ psubb t1, t2
+ pavgb t1, p1
+ pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
+ psrlw t3, 2
+ pavgb t3, mpb_00
+ pxor t3, t1
+ pand t3, mpb_01
+ psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
+
+ mova t3, p0
+ mova t2, p0
+ pxor t3, q1
+ pavgb t2, q1
+ pand t3, mpb_01
+ psubb t2, t3
+ pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
+
+ pxor t1, t2
+ pxor t2, p0
+ pand t1, mask1p
+ pand t2, mask0
+ pxor t1, t2
+ pxor t1, p0
+ mova %1, t1 ; store p0
+
+ mova t1, %4 ; p3
+ mova t2, t1
+ pavgb t1, p2
+ paddb t2, p2
+ pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
+ paddb t2, t2
+ paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
+ psrlw t2, 2
+ pavgb t2, mpb_00
+ pxor t2, t1
+ pand t2, mpb_01
+ psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
+
+ pxor t0, p1
+ pxor t1, p2
+ pand t0, mask1p
+ pand t1, mask1p
+ pxor t0, p1
+ pxor t1, p2
+ mova %2, t0 ; store p1
+ mova %3, t1 ; store p2
+%endmacro
+
+%macro LUMA_INTRA_SWAP_PQ 0
+ %define q1 m0
+ %define q0 m1
+ %define p0 m2
+ %define p1 m3
+ %define p2 q2
+ %define mask1p mask1q
+%endmacro
+
+%macro DEBLOCK_LUMA_INTRA 2
+ %define p1 m0
+ %define p0 m1
+ %define q0 m2
+ %define q1 m3
+ %define t0 m4
+ %define t1 m5
+ %define t2 m6
+ %define t3 m7
+%ifdef ARCH_X86_64
+ %define p2 m8
+ %define q2 m9
+ %define t4 m10
+ %define t5 m11
+ %define mask0 m12
+ %define mask1p m13
+ %define mask1q [rsp-24]
+ %define mpb_00 m14
+ %define mpb_01 m15
+%else
+ %define spill(x) [esp+16*x+((stack_offset+4)&15)]
+ %define p2 [r4+r1]
+ %define q2 [r0+2*r1]
+ %define t4 spill(0)
+ %define t5 spill(1)
+ %define mask0 spill(2)
+ %define mask1p spill(3)
+ %define mask1q spill(4)
+ %define mpb_00 [pb_00 GLOBAL]
+ %define mpb_01 [pb_01 GLOBAL]
+%endif
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal x264_deblock_%2_luma_intra_%1, 4,6
+%ifndef ARCH_X86_64
+ sub esp, 0x60
+%endif
+ lea r4, [r1*4]
+ lea r5, [r1*3] ; 3*stride
+ dec r2d ; alpha-1
+ jl .end
+ neg r4
+ dec r3d ; beta-1
+ jl .end
+ add r4, r0 ; pix-4*stride
+ mova p1, [r4+2*r1]
+ mova p0, [r4+r5]
+ mova q0, [r0]
+ mova q1, [r0+r1]
+%ifdef ARCH_X86_64
+ pxor mpb_00, mpb_00
+ mova mpb_01, [pb_01 GLOBAL]
+ LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
+ SWAP 7, 12 ; m12=mask0
+ pavgb t5, mpb_00
+ pavgb t5, mpb_01 ; alpha/4+1
+ movdqa p2, [r4+r1]
+ movdqa q2, [r0+2*r1]
+ DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
+ DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
+ DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
+ pand t0, mask0
+ pand t4, t0
+ pand t2, t0
+ mova mask1q, t4
+ mova mask1p, t2
+%else
+ LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
+ mova m4, t5
+ mova mask0, m7
+ pavgb m4, [pb_00 GLOBAL]
+ pavgb m4, [pb_01 GLOBAL] ; alpha/4+1
+ DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
+ pand m6, mask0
+ DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
+ pand m4, m6
+ mova mask1p, m4
+ DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
+ pand m4, m6
+ mova mask1q, m4
+%endif
+ LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
+ LUMA_INTRA_SWAP_PQ
+ LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
+.end:
+%ifndef ARCH_X86_64
+ add esp, 0x60
+%endif
+ RET
+
+INIT_MMX
+%ifdef ARCH_X86_64
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal x264_deblock_h_luma_intra_%1
+ movsxd r10, r1d
+ lea r11, [r10*3]
+ lea rax, [r0-4]
+ lea r9, [r0-4+r11]
+ sub rsp, 0x88
+ %define pix_tmp rsp
+
+ ; transpose 8x16 -> tmp space
+ TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+ lea rax, [rax+r10*8]
+ lea r9, [r9+r10*8]
+ TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+
+ lea r0, [pix_tmp+0x40]
+ mov r1, 0x10
+ call x264_deblock_v_luma_intra_%1
+
+ ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
+ lea r9, [rax+r11]
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
+ shl r10, 3
+ sub rax, r10
+ sub r9, r10
+ shr r10, 3
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
+ add rsp, 0x88
+ ret
+%else
+cglobal x264_deblock_h_luma_intra_%1, 2,4
+ lea r3, [r1*3]
+ sub r0, 4
+ lea r2, [r0+r3]
+%assign pad 0x8c-(stack_offset&15)
+ SUB rsp, pad
+ %define pix_tmp rsp
+
+ ; transpose 8x16 -> tmp space
+ TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+ lea r0, [r0+r1*8]
+ lea r2, [r2+r1*8]
+ TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+
+ lea r0, [pix_tmp+0x40]
+ PUSH dword r3m
+ PUSH dword r2m
+ PUSH dword 16
+ PUSH r0
+ call x264_deblock_%2_luma_intra_%1
+%ifidn %2, v8
+ add dword [rsp], 8 ; pix_tmp+8
+ call x264_deblock_%2_luma_intra_%1
+%endif
+ ADD esp, 16
+
+ mov r1, r1m
+ mov r0, r0m
+ lea r3, [r1*3]
+ sub r0, 4
+ lea r2, [r0+r3]
+ ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
+ lea r0, [r0+r1*8]
+ lea r2, [r2+r1*8]
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
+ ADD rsp, pad
+ RET
+%endif ; ARCH_X86_64
+%endmacro ; DEBLOCK_LUMA_INTRA
+
+INIT_XMM
+DEBLOCK_LUMA_INTRA sse2, v
+%ifndef ARCH_X86_64
+INIT_MMX
+DEBLOCK_LUMA_INTRA mmxext, v8
+%endif
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
new file mode 100644
index 0000000..909c274
--- /dev/null
+++ b/libavcodec/x86/h264_i386.h
@@ -0,0 +1,155 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003 Michael Niedermayer <michaelni at gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file h264_i386.h
+ * H.264 / AVC / MPEG4 part10 codec.
+ * non-MMX i386-specific optimizations for H.264
+ * @author Michael Niedermayer <michaelni at gmx.at>
+ */
+
+#ifndef AVCODEC_X86_H264_I386_H
+#define AVCODEC_X86_H264_I386_H
+
+#include "libavcodec/cabac.h"
+
+//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
+//as that would make optimization work hard)
+#if defined(ARCH_X86) && defined(HAVE_7REGS) && \
+ defined(HAVE_EBX_AVAILABLE) && \
+ !defined(BROKEN_RELOCATIONS)
+static int decode_significance_x86(CABACContext *c, int max_coeff,
+ uint8_t *significant_coeff_ctx_base,
+ int *index){
+ void *end= significant_coeff_ctx_base + max_coeff - 1;
+ int minusstart= -(int)significant_coeff_ctx_base;
+ int minusindex= 4-(int)index;
+ int coeff_count;
+ __asm__ volatile(
+ "movl "RANGE "(%3), %%esi \n\t"
+ "movl "LOW "(%3), %%ebx \n\t"
+
+ "2: \n\t"
+
+ BRANCHLESS_GET_CABAC("%%edx", "%3", "(%1)", "%%ebx",
+ "%%bx", "%%esi", "%%eax", "%%al")
+
+ "test $1, %%edx \n\t"
+ " jz 3f \n\t"
+
+ BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx",
+ "%%bx", "%%esi", "%%eax", "%%al")
+
+ "mov %2, %%"REG_a" \n\t"
+ "movl %4, %%ecx \n\t"
+ "add %1, %%"REG_c" \n\t"
+ "movl %%ecx, (%%"REG_a") \n\t"
+
+ "test $1, %%edx \n\t"
+ " jnz 4f \n\t"
+
+ "add $4, %%"REG_a" \n\t"
+ "mov %%"REG_a", %2 \n\t"
+
+ "3: \n\t"
+ "add $1, %1 \n\t"
+ "cmp %5, %1 \n\t"
+ " jb 2b \n\t"
+ "mov %2, %%"REG_a" \n\t"
+ "movl %4, %%ecx \n\t"
+ "add %1, %%"REG_c" \n\t"
+ "movl %%ecx, (%%"REG_a") \n\t"
+ "4: \n\t"
+ "add %6, %%eax \n\t"
+ "shr $2, %%eax \n\t"
+
+ "movl %%esi, "RANGE "(%3) \n\t"
+ "movl %%ebx, "LOW "(%3) \n\t"
+ :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index)
+ :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex)
+ : "%"REG_c, "%ebx", "%edx", "%esi", "memory"
+ );
+ return coeff_count;
+}
+
+static int decode_significance_8x8_x86(CABACContext *c,
+ uint8_t *significant_coeff_ctx_base,
+ int *index, const uint8_t *sig_off){
+ int minusindex= 4-(int)index;
+ int coeff_count;
+ x86_reg last=0;
+ __asm__ volatile(
+ "movl "RANGE "(%3), %%esi \n\t"
+ "movl "LOW "(%3), %%ebx \n\t"
+
+ "mov %1, %%"REG_D" \n\t"
+ "2: \n\t"
+
+ "mov %6, %%"REG_a" \n\t"
+ "movzbl (%%"REG_a", %%"REG_D"), %%edi \n\t"
+ "add %5, %%"REG_D" \n\t"
+
+ BRANCHLESS_GET_CABAC("%%edx", "%3", "(%%"REG_D")", "%%ebx",
+ "%%bx", "%%esi", "%%eax", "%%al")
+
+ "mov %1, %%edi \n\t"
+ "test $1, %%edx \n\t"
+ " jz 3f \n\t"
+
+ "movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%%edi), %%edi\n\t"
+ "add %5, %%"REG_D" \n\t"
+
+ BRANCHLESS_GET_CABAC("%%edx", "%3", "15(%%"REG_D")", "%%ebx",
+ "%%bx", "%%esi", "%%eax", "%%al")
+
+ "mov %2, %%"REG_a" \n\t"
+ "mov %1, %%edi \n\t"
+ "movl %%edi, (%%"REG_a") \n\t"
+
+ "test $1, %%edx \n\t"
+ " jnz 4f \n\t"
+
+ "add $4, %%"REG_a" \n\t"
+ "mov %%"REG_a", %2 \n\t"
+
+ "3: \n\t"
+ "addl $1, %%edi \n\t"
+ "mov %%edi, %1 \n\t"
+ "cmpl $63, %%edi \n\t"
+ " jb 2b \n\t"
+ "mov %2, %%"REG_a" \n\t"
+ "movl %%edi, (%%"REG_a") \n\t"
+ "4: \n\t"
+ "addl %4, %%eax \n\t"
+ "shr $2, %%eax \n\t"
+
+ "movl %%esi, "RANGE "(%3) \n\t"
+ "movl %%ebx, "LOW "(%3) \n\t"
+ :"=&a"(coeff_count),"+m"(last), "+m"(index)
+ :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off)
+ : "%"REG_c, "%ebx", "%edx", "%esi", "%"REG_D, "memory"
+ );
+ return coeff_count;
+}
+#endif /* defined(ARCH_X86) && defined(HAVE_7REGS) && */
+ /* defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS) */
+
+#endif /* AVCODEC_X86_H264_I386_H */
diff --git a/libavcodec/x86/h264_idct_sse2.asm b/libavcodec/x86/h264_idct_sse2.asm
new file mode 100755
index 0000000..a46cd97
--- /dev/null
+++ b/libavcodec/x86/h264_idct_sse2.asm
@@ -0,0 +1,61 @@
+;*****************************************************************************
+;* dct-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2003-2008 x264 project
+;*
+;* Authors: Laurent Aimar <fenrir at via.ecp.fr>
+;* Loren Merritt <lorenm at u.washington.edu>
+;* Holger Lubitz <hal at duncan.ol.sub.de>
+;* Min Chen <chenm001.163.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA
+pw_32: times 8 dw 32
+
+SECTION .text
+
+%macro IDCT4_1D 6
+ SUMSUB_BA m%3, m%1
+ SUMSUBD2_AB m%2, m%4, m%6, m%5
+ SUMSUB_BADC m%2, m%3, m%5, m%1
+ SWAP %1, %2, %5, %4, %3
+%endmacro
+
+INIT_XMM
+cglobal x264_add8x4_idct_sse2, 3,3
+ movq m0, [r1+ 0]
+ movq m1, [r1+ 8]
+ movq m2, [r1+16]
+ movq m3, [r1+24]
+ movhps m0, [r1+32]
+ movhps m1, [r1+40]
+ movhps m2, [r1+48]
+ movhps m3, [r1+56]
+ IDCT4_1D 0,1,2,3,4,5
+ TRANSPOSE2x4x4W 0,1,2,3,4
+ paddw m0, [pw_32 GLOBAL]
+ IDCT4_1D 0,1,2,3,4,5
+ pxor m7, m7
+ STORE_DIFF m0, m4, m7, [r0]
+ STORE_DIFF m1, m4, m7, [r0+r2]
+ lea r0, [r0+r2*2]
+ STORE_DIFF m2, m4, m7, [r0]
+ STORE_DIFF m3, m4, m7, [r0+r2]
+ RET
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
new file mode 100644
index 0000000..8eeb657
--- /dev/null
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -0,0 +1,2296 @@
+/*
+ * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dsputil_mmx.h"
+
+DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;
+DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;
+
+/***********************************/
+/* IDCT */
+
+#define SUMSUB_BADC( a, b, c, d ) \
+ "paddw "#b", "#a" \n\t"\
+ "paddw "#d", "#c" \n\t"\
+ "paddw "#b", "#b" \n\t"\
+ "paddw "#d", "#d" \n\t"\
+ "psubw "#a", "#b" \n\t"\
+ "psubw "#c", "#d" \n\t"
+
+#define SUMSUBD2_AB( a, b, t ) \
+ "movq "#b", "#t" \n\t"\
+ "psraw $1 , "#b" \n\t"\
+ "paddw "#a", "#b" \n\t"\
+ "psraw $1 , "#a" \n\t"\
+ "psubw "#t", "#a" \n\t"
+
+#define IDCT4_1D( s02, s13, d02, d13, t ) \
+ SUMSUB_BA ( s02, d02 )\
+ SUMSUBD2_AB( s13, d13, t )\
+ SUMSUB_BADC( d13, s02, s13, d02 )
+
+#define STORE_DIFF_4P( p, t, z ) \
+ "psraw $6, "#p" \n\t"\
+ "movd (%0), "#t" \n\t"\
+ "punpcklbw "#z", "#t" \n\t"\
+ "paddsw "#t", "#p" \n\t"\
+ "packuswb "#z", "#p" \n\t"\
+ "movd "#p", (%0) \n\t"
+
+static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
+{
+ /* Load dct coeffs */
+ __asm__ volatile(
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm1 \n\t"
+ "movq 16(%0), %%mm2 \n\t"
+ "movq 24(%0), %%mm3 \n\t"
+ :: "r"(block) );
+
+ __asm__ volatile(
+ /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */
+ IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
+
+ "movq %0, %%mm6 \n\t"
+ /* in: 1,4,0,2 out: 1,2,3,0 */
+ TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
+
+ "paddw %%mm6, %%mm3 \n\t"
+
+ /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */
+ IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
+
+ "pxor %%mm7, %%mm7 \n\t"
+ :: "m"(ff_pw_32));
+
+ __asm__ volatile(
+ STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
+ "add %1, %0 \n\t"
+ STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
+ "add %1, %0 \n\t"
+ STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
+ "add %1, %0 \n\t"
+ STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
+ : "+r"(dst)
+ : "r" ((x86_reg)stride)
+ );
+}
+
+static inline void h264_idct8_1d(int16_t *block)
+{
+ __asm__ volatile(
+ "movq 112(%0), %%mm7 \n\t"
+ "movq 80(%0), %%mm0 \n\t"
+ "movq 48(%0), %%mm3 \n\t"
+ "movq 16(%0), %%mm5 \n\t"
+
+ "movq %%mm0, %%mm4 \n\t"
+ "movq %%mm5, %%mm1 \n\t"
+ "psraw $1, %%mm4 \n\t"
+ "psraw $1, %%mm1 \n\t"
+ "paddw %%mm0, %%mm4 \n\t"
+ "paddw %%mm5, %%mm1 \n\t"
+ "paddw %%mm7, %%mm4 \n\t"
+ "paddw %%mm0, %%mm1 \n\t"
+ "psubw %%mm5, %%mm4 \n\t"
+ "paddw %%mm3, %%mm1 \n\t"
+
+ "psubw %%mm3, %%mm5 \n\t"
+ "psubw %%mm3, %%mm0 \n\t"
+ "paddw %%mm7, %%mm5 \n\t"
+ "psubw %%mm7, %%mm0 \n\t"
+ "psraw $1, %%mm3 \n\t"
+ "psraw $1, %%mm7 \n\t"
+ "psubw %%mm3, %%mm5 \n\t"
+ "psubw %%mm7, %%mm0 \n\t"
+
+ "movq %%mm4, %%mm3 \n\t"
+ "movq %%mm1, %%mm7 \n\t"
+ "psraw $2, %%mm1 \n\t"
+ "psraw $2, %%mm3 \n\t"
+ "paddw %%mm5, %%mm3 \n\t"
+ "psraw $2, %%mm5 \n\t"
+ "paddw %%mm0, %%mm1 \n\t"
+ "psraw $2, %%mm0 \n\t"
+ "psubw %%mm4, %%mm5 \n\t"
+ "psubw %%mm0, %%mm7 \n\t"
+
+ "movq 32(%0), %%mm2 \n\t"
+ "movq 96(%0), %%mm6 \n\t"
+ "movq %%mm2, %%mm4 \n\t"
+ "movq %%mm6, %%mm0 \n\t"
+ "psraw $1, %%mm4 \n\t"
+ "psraw $1, %%mm6 \n\t"
+ "psubw %%mm0, %%mm4 \n\t"
+ "paddw %%mm2, %%mm6 \n\t"
+
+ "movq (%0), %%mm2 \n\t"
+ "movq 64(%0), %%mm0 \n\t"
+ SUMSUB_BA( %%mm0, %%mm2 )
+ SUMSUB_BA( %%mm6, %%mm0 )
+ SUMSUB_BA( %%mm4, %%mm2 )
+ SUMSUB_BA( %%mm7, %%mm6 )
+ SUMSUB_BA( %%mm5, %%mm4 )
+ SUMSUB_BA( %%mm3, %%mm2 )
+ SUMSUB_BA( %%mm1, %%mm0 )
+ :: "r"(block)
+ );
+}
+
+static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
+{
+ int i;
+ int16_t __attribute__ ((aligned(8))) b2[64];
+
+ block[0] += 32;
+
+ for(i=0; i<2; i++){
+ DECLARE_ALIGNED_8(uint64_t, tmp);
+
+ h264_idct8_1d(block+4*i);
+
+ __asm__ volatile(
+ "movq %%mm7, %0 \n\t"
+ TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
+ "movq %%mm0, 8(%1) \n\t"
+ "movq %%mm6, 24(%1) \n\t"
+ "movq %%mm7, 40(%1) \n\t"
+ "movq %%mm4, 56(%1) \n\t"
+ "movq %0, %%mm7 \n\t"
+ TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
+ "movq %%mm7, (%1) \n\t"
+ "movq %%mm1, 16(%1) \n\t"
+ "movq %%mm0, 32(%1) \n\t"
+ "movq %%mm3, 48(%1) \n\t"
+ : "=m"(tmp)
+ : "r"(b2+32*i)
+ : "memory"
+ );
+ }
+
+ for(i=0; i<2; i++){
+ h264_idct8_1d(b2+4*i);
+
+ __asm__ volatile(
+ "psraw $6, %%mm7 \n\t"
+ "psraw $6, %%mm6 \n\t"
+ "psraw $6, %%mm5 \n\t"
+ "psraw $6, %%mm4 \n\t"
+ "psraw $6, %%mm3 \n\t"
+ "psraw $6, %%mm2 \n\t"
+ "psraw $6, %%mm1 \n\t"
+ "psraw $6, %%mm0 \n\t"
+
+ "movq %%mm7, (%0) \n\t"
+ "movq %%mm5, 16(%0) \n\t"
+ "movq %%mm3, 32(%0) \n\t"
+ "movq %%mm1, 48(%0) \n\t"
+ "movq %%mm0, 64(%0) \n\t"
+ "movq %%mm2, 80(%0) \n\t"
+ "movq %%mm4, 96(%0) \n\t"
+ "movq %%mm6, 112(%0) \n\t"
+ :: "r"(b2+4*i)
+ : "memory"
+ );
+ }
+
+ add_pixels_clamped_mmx(b2, dst, stride);
+}
+
+#define STORE_DIFF_8P( p, d, t, z )\
+ "movq "#d", "#t" \n"\
+ "psraw $6, "#p" \n"\
+ "punpcklbw "#z", "#t" \n"\
+ "paddsw "#t", "#p" \n"\
+ "packuswb "#p", "#p" \n"\
+ "movq "#p", "#d" \n"
+
+#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
+ "movdqa "#c", "#a" \n"\
+ "movdqa "#g", "#e" \n"\
+ "psraw $1, "#c" \n"\
+ "psraw $1, "#g" \n"\
+ "psubw "#e", "#c" \n"\
+ "paddw "#a", "#g" \n"\
+ "movdqa "#b", "#e" \n"\
+ "psraw $1, "#e" \n"\
+ "paddw "#b", "#e" \n"\
+ "paddw "#d", "#e" \n"\
+ "paddw "#f", "#e" \n"\
+ "movdqa "#f", "#a" \n"\
+ "psraw $1, "#a" \n"\
+ "paddw "#f", "#a" \n"\
+ "paddw "#h", "#a" \n"\
+ "psubw "#b", "#a" \n"\
+ "psubw "#d", "#b" \n"\
+ "psubw "#d", "#f" \n"\
+ "paddw "#h", "#b" \n"\
+ "psubw "#h", "#f" \n"\
+ "psraw $1, "#d" \n"\
+ "psraw $1, "#h" \n"\
+ "psubw "#d", "#b" \n"\
+ "psubw "#h", "#f" \n"\
+ "movdqa "#e", "#d" \n"\
+ "movdqa "#a", "#h" \n"\
+ "psraw $2, "#d" \n"\
+ "psraw $2, "#h" \n"\
+ "paddw "#f", "#d" \n"\
+ "paddw "#b", "#h" \n"\
+ "psraw $2, "#f" \n"\
+ "psraw $2, "#b" \n"\
+ "psubw "#f", "#e" \n"\
+ "psubw "#a", "#b" \n"\
+ "movdqa 0x00(%1), "#a" \n"\
+ "movdqa 0x40(%1), "#f" \n"\
+ SUMSUB_BA(f, a)\
+ SUMSUB_BA(g, f)\
+ SUMSUB_BA(c, a)\
+ SUMSUB_BA(e, g)\
+ SUMSUB_BA(b, c)\
+ SUMSUB_BA(h, a)\
+ SUMSUB_BA(d, f)
+
+// Full 8x8 IDCT-and-add, SSE2: 1-D pass, 8x8 transpose (spilling through
+// the block buffer), second 1-D pass, then add the rounded result to
+// 8 rows of 8 pixels at dst.  block must be 16-byte aligned (movdqa).
+static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
+{
+ __asm__ volatile(
+ "movdqa 0x10(%1), %%xmm1 \n"
+ "movdqa 0x20(%1), %%xmm2 \n"
+ "movdqa 0x30(%1), %%xmm3 \n"
+ "movdqa 0x50(%1), %%xmm5 \n"
+ "movdqa 0x60(%1), %%xmm6 \n"
+ "movdqa 0x70(%1), %%xmm7 \n"
+ H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
+ TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
+ // bias by 32 (%4 = ff_pw_32) so the final >>6 in STORE_DIFF_8P rounds
+ "paddw %4, %%xmm4 \n"
+ "movdqa %%xmm4, 0x00(%1) \n"
+ "movdqa %%xmm2, 0x40(%1) \n"
+ H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
+ // rows 6/7 are parked in the block buffer while xmm6/7 serve as
+ // scratch/zero for the first six stores
+ "movdqa %%xmm6, 0x60(%1) \n"
+ "movdqa %%xmm7, 0x70(%1) \n"
+ "pxor %%xmm7, %%xmm7 \n"
+ STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7)
+ STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7)
+ STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
+ STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7)
+ "lea (%0,%2,4), %0 \n"
+ STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7)
+ STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7)
+ "movdqa 0x60(%1), %%xmm0 \n"
+ "movdqa 0x70(%1), %%xmm1 \n"
+ STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
+ STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7)
+ :"+r"(dst)
+ :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
+ );
+}
+
+// DC-only 4x4 IDCT-and-add: dst += (block[0]+32)>>6 with unsigned
+// saturation.  mm0 = broadcast +dc bytes, mm1 = broadcast -dc bytes;
+// the paddusb/psubusb pair clips for either sign of dc.
+static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
+{
+ int dc = (block[0] + 32) >> 6;
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ // mm0/mm1 carry over from the statement above into this one
+ __asm__ volatile(
+ "movd %0, %%mm2 \n\t"
+ "movd %1, %%mm3 \n\t"
+ "movd %2, %%mm4 \n\t"
+ "movd %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movd %%mm2, %0 \n\t"
+ "movd %%mm3, %1 \n\t"
+ "movd %%mm4, %2 \n\t"
+ "movd %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dst+0*stride)),
+ "+m"(*(uint32_t*)(dst+1*stride)),
+ "+m"(*(uint32_t*)(dst+2*stride)),
+ "+m"(*(uint32_t*)(dst+3*stride))
+ );
+}
+
+// DC-only 8x8 IDCT-and-add: same saturated +/-dc trick as the 4x4
+// version, applied to 8-byte rows; the loop covers 8 rows in two
+// 4-row batches.
+static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
+{
+ int dc = (block[0] + 32) >> 6;
+ int y;
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ for(y=2; y--; dst += 4*stride){
+ __asm__ volatile(
+ "movq %0, %%mm2 \n\t"
+ "movq %1, %%mm3 \n\t"
+ "movq %2, %%mm4 \n\t"
+ "movq %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movq %%mm2, %0 \n\t"
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %3 \n\t"
+ :"+m"(*(uint64_t*)(dst+0*stride)),
+ "+m"(*(uint64_t*)(dst+1*stride)),
+ "+m"(*(uint64_t*)(dst+2*stride)),
+ "+m"(*(uint64_t*)(dst+3*stride))
+ );
+ }
+}
+
+//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
+// Maps a 4x4 block index (0-15 luma, 16-23 chroma) to its slot in the
+// nnzc[6*8] non-zero-count cache used by the idct_add* wrappers below.
+static const uint8_t scan8[16 + 2*4]={
+ 4+1*8, 5+1*8, 4+2*8, 5+2*8,
+ 6+1*8, 7+1*8, 6+2*8, 7+2*8,
+ 4+3*8, 5+3*8, 4+4*8, 5+4*8,
+ 6+3*8, 7+3*8, 6+4*8, 7+4*8,
+ 1+1*8, 2+1*8,
+ 1+2*8, 2+2*8,
+ 1+4*8, 2+4*8,
+ 1+5*8, 2+5*8,
+};
+
+// Add the 4x4 IDCT of every luma block that has coded coefficients
+// (per the nnzc cache) into dst.
+static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ if(nnzc[ scan8[i] ])
+ ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+// 8x8-transform variant: four 8x8 blocks, stepping the 4x4 index by 4.
+static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i+=4){
+ if(nnzc[ scan8[i] ])
+ ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+
+// Like add16_mmx, but uses the DC-only fast path when the block has a
+// single coefficient and it is the DC (nnz==1 && block[i*16]).
+static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ int nnz = nnzc[ scan8[i] ];
+ if(nnz){
+ if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
+ else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
+ }
+ }
+}
+
+// Intra variant: a block is also added when only its DC coefficient
+// is non-zero (block[i*16]), even if the nnzc cache entry is 0.
+static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ if(nnzc[ scan8[i] ] || block[i*16])
+ ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+// Intra variant with DC fast path: full IDCT when AC coeffs are coded,
+// DC-only add when just the DC is set.
+static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
+ else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+// 8x8 variant with DC-only fast path (nnz==1 && DC coefficient set).
+static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i+=4){
+ int nnz = nnzc[ scan8[i] ];
+ if(nnz){
+ if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
+ else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride);
+ }
+ }
+}
+
+// SSE2 8x8 variant; the DC-only path still uses the MMX2 routine
+// (DC add is cheap and 8 bytes wide, so MMX suffices).
+static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i+=4){
+ int nnz = nnzc[ scan8[i] ];
+ if(nnz){
+ if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
+ else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride);
+ }
+ }
+}
+
+// Chroma: blocks 16-19 go to dest[0], 20-23 to dest[1]
+// (plane selected by (i&4)>>2).  DC-only blocks are added too.
+static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=16; i<16+8; i++){
+ if(nnzc[ scan8[i] ] || block[i*16])
+ ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ }
+}
+
+// Chroma with DC-only fast path; plane selection as in add8_mmx.
+static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=16; i<16+8; i++){
+ if(nnzc[ scan8[i] ])
+ ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ else if(block[i*16])
+ ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ }
+}
+
+#if defined(CONFIG_GPL) && defined(HAVE_YASM)
+// DC-only add for a PAIR of horizontally adjacent 4x4 blocks
+// (DCs at block[0] and block[16]), covering an 8-pixel-wide area.
+// Rounding (+32, >>6) is done in the SIMD code; the existing per-line
+// comments show the register layout.
+static void ff_h264_idct_dc_add8_mmx2(uint8_t *dst, int16_t *block, int stride)
+{
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t" // 0 0 X D
+ "punpcklwd %1, %%mm0 \n\t" // x X d D
+ "paddsw %2, %%mm0 \n\t"
+ "psraw $6, %%mm0 \n\t"
+ "punpcklwd %%mm0, %%mm0 \n\t" // d d D D
+ "pxor %%mm1, %%mm1 \n\t" // 0 0 0 0
+ "psubw %%mm0, %%mm1 \n\t" // -d-d-D-D
+ "packuswb %%mm1, %%mm0 \n\t" // -d-d-D-D d d D D
+ "pshufw $0xFA, %%mm0, %%mm1 \n\t" // -d-d-d-d-D-D-D-D
+ "punpcklwd %%mm0, %%mm0 \n\t" // d d d d D D D D
+ ::"m"(block[ 0]),
+ "m"(block[16]),
+ "m"(ff_pw_32)
+ );
+ // mm0 = +dc bytes per half, mm1 = -dc bytes: same saturated-add
+ // clipping scheme as ff_h264_idct_dc_add_mmx2 above
+ __asm__ volatile(
+ "movq %0, %%mm2 \n\t"
+ "movq %1, %%mm3 \n\t"
+ "movq %2, %%mm4 \n\t"
+ "movq %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movq %%mm2, %0 \n\t"
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %3 \n\t"
+ :"+m"(*(uint64_t*)(dst+0*stride)),
+ "+m"(*(uint64_t*)(dst+1*stride)),
+ "+m"(*(uint64_t*)(dst+2*stride)),
+ "+m"(*(uint64_t*)(dst+3*stride))
+ );
+}
+
+extern void ff_x264_add8x4_idct_sse2(uint8_t *dst, int16_t *block, int stride);
+
+// Luma via the x264 SSE2 routine, which handles two adjacent 4x4
+// blocks per call — hence the i+=2 step and the OR of both nnzc slots.
+static void ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i+=2)
+ if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
+ ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
+}
+
+// Intra variant: when neither block of the pair has coded AC coeffs
+// but either DC is set, use the paired DC-only add instead.
+static void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i+=2){
+ if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
+ ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
+ else if(block[i*16]|block[i*16+16])
+ ff_h264_idct_dc_add8_mmx2(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+// Chroma via the x264 SSE2 routine.  ff_x264_add8x4_idct_sse2() (and
+// the DC fallback) process TWO adjacent 4x4 blocks per call, so the
+// loop must step by 2 like ff_h264_idct_add16_sse2/add16intra_sse2
+// above.  Stepping by 1 would process each pair twice and, at i==23,
+// read scan8[24] past the end of the 24-entry scan8[] table.
+// dest[0]/dest[1] are the two chroma planes, selected by (i&4)>>2.
+static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=16; i<16+8; i+=2){
+ if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
+ ff_x264_add8x4_idct_sse2 (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ else if(block[i*16]|block[i*16+16])
+ ff_h264_idct_dc_add8_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ }
+}
+#endif
+
+/***********************************/
+/* deblocking */
+
+// out: o = sat(|x-y| - a) per byte, i.e. nonzero exactly where |x-y|>a
+// (one of the two saturated subtractions is always 0, so por gives |x-y|)
+// clobbers: t
+#define DIFF_GT_MMX(x,y,a,o,t)\
+ "movq "#y", "#t" \n\t"\
+ "movq "#x", "#o" \n\t"\
+ "psubusb "#x", "#t" \n\t"\
+ "psubusb "#y", "#o" \n\t"\
+ "por "#t", "#o" \n\t"\
+ "psubusb "#a", "#o" \n\t"
+
+// out: o = 0xff per byte where |x-y| <= a, 0 where |x-y| > a
+// (note the INVERSE polarity of DIFF_GT_MMX: both saturated diffs are
+// reduced by a and compared for equality — they match only when the
+// true difference fits within a; callers AND the result with the mask,
+// cf. the "|p2-p0|<beta" usage below)
+// clobbers: t
+#define DIFF_GT2_MMX(x,y,a,o,t)\
+ "movq "#y", "#t" \n\t"\
+ "movq "#x", "#o" \n\t"\
+ "psubusb "#x", "#t" \n\t"\
+ "psubusb "#y", "#o" \n\t"\
+ "psubusb "#a", "#t" \n\t"\
+ "psubusb "#a", "#o" \n\t"\
+ "pcmpeqb "#t", "#o" \n\t"\
+
+// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
+// out: mm5=beta-1, mm7=mask
+// clobbers: mm4,mm6
+// alpha1/beta1 are 16-bit memory operands holding alpha-1/beta-1;
+// pshufw+packuswb broadcasts them to all 8 bytes.  mm7 ends up 0xff
+// where ALL three edge conditions hold (none of the DIFF_GT results
+// is nonzero), i.e. where the edge should be filtered.
+#define H264_DEBLOCK_MASK(alpha1, beta1) \
+ "pshufw $0, "#alpha1", %%mm4 \n\t"\
+ "pshufw $0, "#beta1 ", %%mm5 \n\t"\
+ "packuswb %%mm4, %%mm4 \n\t"\
+ "packuswb %%mm5, %%mm5 \n\t"\
+ DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
+ DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
+ "por %%mm4, %%mm7 \n\t"\
+ DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
+ "por %%mm4, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ "pcmpeqb %%mm6, %%mm7 \n\t"
+
+// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
+// out: mm1=p0' mm2=q0'
+// clobbers: mm0,3-6
+// Computes the clipped delta via pavgb tricks (see the per-line
+// comments) and applies +delta to p0 / -delta to q0 with saturation.
+// NOTE(review): the second macro argument (pb_3f) is never referenced
+// in the body — callers pass a dummy ("unused").
+#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
+ "movq %%mm1 , %%mm5 \n\t"\
+ "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\
+ "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\
+ "pcmpeqb %%mm4 , %%mm4 \n\t"\
+ "pxor %%mm4 , %%mm3 \n\t"\
+ "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
+ "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\
+ "pxor %%mm1 , %%mm4 \n\t"\
+ "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
+ "pavgb %%mm5 , %%mm3 \n\t"\
+ "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\
+ "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
+ "psubusb %%mm3 , %%mm6 \n\t"\
+ "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
+ "pminub %%mm7 , %%mm6 \n\t"\
+ "pminub %%mm7 , %%mm3 \n\t"\
+ "psubusb %%mm6 , %%mm1 \n\t"\
+ "psubusb %%mm3 , %%mm2 \n\t"\
+ "paddusb %%mm3 , %%mm1 \n\t"\
+ "paddusb %%mm6 , %%mm2 \n\t"
+
+// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone
+// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
+// clobbers: q2, tmp, tc0
+// The pxor/pand-with-1/psubusb sequence corrects pavgb's round-up so
+// the average rounds the same way as the spec's (a+b)>>1.
+#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
+ "movq %%mm1, "#tmp" \n\t"\
+ "pavgb %%mm2, "#tmp" \n\t"\
+ "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\
+ "pxor "q2addr", "#tmp" \n\t"\
+ "pand %8, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\
+ "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
+ "movq "#p1", "#tmp" \n\t"\
+ "psubusb "#tc0", "#tmp" \n\t"\
+ "paddusb "#p1", "#tc0" \n\t"\
+ "pmaxub "#tmp", "#q2" \n\t"\
+ "pminub "#tc0", "#q2" \n\t"\
+ "movq "#q2", "q1addr" \n\t"
+
+// Core luma deblock for one 8-pixel-wide vertical edge.
+// pix points at q0's row; %1 = pix-3*stride (p2 row base), %2 = pix.
+// alpha1/beta1 must already be alpha-1/beta-1.  tc0: two int8 limits
+// (broadcast to 4 bytes each); a negative tc0 disables filtering.
+// tmp0[0] holds tc&mask, tmp0[1] the broadcast tc0 bytes.
+static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
+{
+ DECLARE_ALIGNED_8(uint64_t, tmp0[2]);
+
+ __asm__ volatile(
+ "movq (%1,%3), %%mm0 \n\t" //p1
+ "movq (%1,%3,2), %%mm1 \n\t" //p0
+ "movq (%2), %%mm2 \n\t" //q0
+ "movq (%2,%3), %%mm3 \n\t" //q1
+ H264_DEBLOCK_MASK(%6, %7)
+
+ "movd %5, %%mm4 \n\t"
+ "punpcklbw %%mm4, %%mm4 \n\t"
+ "punpcklwd %%mm4, %%mm4 \n\t"
+ "pcmpeqb %%mm3, %%mm3 \n\t"
+ "movq %%mm4, %%mm6 \n\t"
+ // mm4 = 0xff where tc0 >= 0 (pcmpgtb against -1); negative tc0
+ // entries knock their 4 pixels out of the mask
+ "pcmpgtb %%mm3, %%mm4 \n\t"
+ "movq %%mm6, 8+%0 \n\t"
+ "pand %%mm4, %%mm7 \n\t"
+ "movq %%mm7, %0 \n\t"
+
+ /* filter p1 */
+ "movq (%1), %%mm3 \n\t" //p2
+ DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
+ "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta
+ "pand 8+%0, %%mm7 \n\t" // mask & tc0
+ "movq %%mm7, %%mm4 \n\t"
+ "psubb %%mm6, %%mm7 \n\t"
+ "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0
+ H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)
+
+ /* filter q1 */
+ "movq (%2,%3,2), %%mm4 \n\t" //q2
+ DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
+ "pand %0, %%mm6 \n\t"
+ "movq 8+%0, %%mm5 \n\t" // can be merged with the and below but is slower then
+ "pand %%mm6, %%mm5 \n\t"
+ "psubb %%mm6, %%mm7 \n\t"
+ "movq (%2,%3), %%mm3 \n\t"
+ H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)
+
+ /* filter p0, q0 */
+ H264_DEBLOCK_P0_Q0(%8, unused)
+ "movq %%mm1, (%1,%3,2) \n\t"
+ "movq %%mm2, (%2) \n\t"
+
+ : "=m"(*tmp0)
+ : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
+ "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
+ "m"(ff_bone)
+ );
+}
+
+// Vertical (horizontal-edge) luma filter: 16 pixels as two 8-wide
+// halves.  (tc0[a]&tc0[b]) has its sign bit set only when BOTH limits
+// are negative, in which case the whole half is skipped.
+static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+ if((tc0[0] & tc0[1]) >= 0)
+ h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
+ if((tc0[2] & tc0[3]) >= 0)
+ h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
+}
+// Horizontal (vertical-edge) luma filter: transpose each 8x8 strip
+// into a temp buffer, run the vertical core on it, then transpose the
+// four modified middle columns (p1..q1) back.
+static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+ //FIXME: could cut some load/stores by merging transpose with filter
+ // also, it only needs to transpose 6x8
+ DECLARE_ALIGNED_8(uint8_t, trans[8*8]);
+ int i;
+ for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
+ if((tc0[0] & tc0[1]) < 0)
+ continue;
+ transpose4x4(trans, pix-4, 8, stride);
+ transpose4x4(trans +4*8, pix, 8, stride);
+ transpose4x4(trans+4, pix-4+4*stride, 8, stride);
+ transpose4x4(trans+4+4*8, pix +4*stride, 8, stride);
+ h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
+ transpose4x4(pix-2, trans +2*8, stride, 8);
+ transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
+ }
+}
+
+// Core chroma deblock for one 8-pixel-wide edge: only p0/q0 are
+// modified (no p1/q1 filtering for chroma).  tc0 bytes are broadcast
+// two pixels each via punpcklbw.
+static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
+{
+ __asm__ volatile(
+ "movq (%0), %%mm0 \n\t" //p1
+ "movq (%0,%2), %%mm1 \n\t" //p0
+ "movq (%1), %%mm2 \n\t" //q0
+ "movq (%1,%2), %%mm3 \n\t" //q1
+ H264_DEBLOCK_MASK(%4, %5)
+ "movd %3, %%mm6 \n\t"
+ "punpcklbw %%mm6, %%mm6 \n\t"
+ "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask
+ H264_DEBLOCK_P0_Q0(%6, %7)
+ "movq %%mm1, (%0,%2) \n\t"
+ "movq %%mm2, (%1) \n\t"
+
+ :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
+ "r"(*(uint32_t*)tc0),
+ "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
+ );
+}
+
+// Vertical chroma filter: one 8-wide call, thresholds pre-decremented.
+static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+ h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
+}
+
+// Horizontal chroma filter: transpose the 4-wide strip around the
+// edge, filter vertically in the temp buffer, transpose back.
+static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+ //FIXME: could cut some load/stores by merging transpose with filter
+ DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
+ transpose4x4(trans, pix-2, 8, stride);
+ transpose4x4(trans+4, pix-2+4*stride, 8, stride);
+ h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
+ transpose4x4(pix-2, trans, stride, 8);
+ transpose4x4(pix-2+4*stride, trans+4, stride, 8);
+}
+
+// p0 = (p0 + q1 + 2*p1 + 2) >> 2
+// Intra chroma filter for one side; the pxor/pand/psubusb corrects
+// pavgb's rounding so the nested averages match the spec formula.
+// clobbers: mm4
+#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
+ "movq "#p0", %%mm4 \n\t"\
+ "pxor "#q1", %%mm4 \n\t"\
+ "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\
+ "pavgb "#q1", "#p0" \n\t"\
+ "psubusb %%mm4, "#p0" \n\t"\
+ "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\
+
+// Intra chroma deblock (no tc0): compute filtered p0'/q0' for all 8
+// pixels, then use the mask (mm7) to keep the filtered value only
+// where the edge conditions hold (psubb/pand/paddb select).
+static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
+{
+ __asm__ volatile(
+ "movq (%0), %%mm0 \n\t"
+ "movq (%0,%2), %%mm1 \n\t"
+ "movq (%1), %%mm2 \n\t"
+ "movq (%1,%2), %%mm3 \n\t"
+ H264_DEBLOCK_MASK(%3, %4)
+ "movq %%mm1, %%mm5 \n\t"
+ "movq %%mm2, %%mm6 \n\t"
+ H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
+ H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
+ "psubb %%mm5, %%mm1 \n\t"
+ "psubb %%mm6, %%mm2 \n\t"
+ "pand %%mm7, %%mm1 \n\t"
+ "pand %%mm7, %%mm2 \n\t"
+ "paddb %%mm5, %%mm1 \n\t"
+ "paddb %%mm6, %%mm2 \n\t"
+ "movq %%mm1, (%0,%2) \n\t"
+ "movq %%mm2, (%1) \n\t"
+ :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
+ "m"(alpha1), "m"(beta1), "m"(ff_bone)
+ );
+}
+
+// Vertical intra chroma filter wrapper (thresholds pre-decremented).
+static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
+{
+ h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
+}
+
+// Horizontal intra chroma filter: transpose, filter, transpose back
+// (same scheme as the non-intra horizontal chroma filter above).
+static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
+{
+ //FIXME: could cut some load/stores by merging transpose with filter
+ DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
+ transpose4x4(trans, pix-2, 8, stride);
+ transpose4x4(trans+4, pix-2+4*stride, 8, stride);
+ h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
+ transpose4x4(pix-2, trans, stride, 8);
+ transpose4x4(pix-2+4*stride, trans+4, stride, 8);
+}
+
+// Compute deblock boundary strengths bS for one macroblock from the
+// nnz cache, reference indices and motion vectors.  dir=1 handles
+// horizontal edges (neighbour at b_idx-8), dir=0 vertical (b_idx-1).
+// mm6/mm5/mm4 hold byte constants 1/3/7 (or 3_1/7_3 pairs for field
+// mode) across all the asm statements below; mm7 stays zero.
+static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
+ int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
+ int dir;
+ __asm__ volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "movq %0, %%mm6 \n\t"
+ "movq %1, %%mm5 \n\t"
+ "movq %2, %%mm4 \n\t"
+ ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7)
+ );
+ if(field)
+ __asm__ volatile(
+ "movq %0, %%mm5 \n\t"
+ "movq %1, %%mm4 \n\t"
+ ::"m"(ff_pb_3_1), "m"(ff_pb_7_3)
+ );
+
+ // could do a special case for dir==0 && edges==1, but it only reduces the
+ // average filter time by 1.2%
+ for( dir=1; dir>=0; dir-- ) {
+ const int d_idx = dir ? -8 : -1;
+ const int mask_mv = dir ? mask_mv1 : mask_mv0;
+ DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
+ int b_idx, edge, l;
+ for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
+ // for dir==0, keep mm0 (mv/ref result) from the previous
+ // iteration; for dir==1 the pand with 0 clears it
+ __asm__ volatile(
+ "pand %0, %%mm0 \n\t"
+ ::"m"(mask_dir)
+ );
+ if(!(mask_mv & edge)) {
+ __asm__ volatile("pxor %%mm0, %%mm0 \n\t":);
+ for( l = bidir; l >= 0; l-- ) {
+ __asm__ volatile(
+ "movd %0, %%mm1 \n\t"
+ "punpckldq %1, %%mm1 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ "psrlw $7, %%mm2 \n\t"
+ "pand %%mm6, %%mm2 \n\t"
+ "por %%mm2, %%mm1 \n\t" // ref_cache with -2 mapped to -1
+ "punpckldq %%mm1, %%mm2 \n\t"
+ "pcmpeqb %%mm2, %%mm1 \n\t"
+ "paddb %%mm6, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn]
+ "por %%mm1, %%mm0 \n\t"
+
+ "movq %2, %%mm1 \n\t"
+ "movq %3, %%mm2 \n\t"
+ "psubw %4, %%mm1 \n\t"
+ "psubw %5, %%mm2 \n\t"
+ "packsswb %%mm2, %%mm1 \n\t"
+ "paddb %%mm5, %%mm1 \n\t"
+ "pminub %%mm4, %%mm1 \n\t"
+ "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit
+ "por %%mm1, %%mm0 \n\t"
+ ::"m"(ref[l][b_idx]),
+ "m"(ref[l][b_idx+d_idx]),
+ "m"(mv[l][b_idx][0]),
+ "m"(mv[l][b_idx+2][0]),
+ "m"(mv[l][b_idx+d_idx][0]),
+ "m"(mv[l][b_idx+d_idx+2][0])
+ );
+ }
+ }
+ __asm__ volatile(
+ "movd %0, %%mm1 \n\t"
+ "por %1, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn]
+ ::"m"(nnz[b_idx]),
+ "m"(nnz[b_idx+d_idx])
+ );
+ __asm__ volatile(
+ // two pcmpeqw against mm7(=0): normalizes mm0 to 0xffff per
+ // word where it was nonzero (double negation), 0 elsewhere
+ "pcmpeqw %%mm7, %%mm0 \n\t"
+ "pcmpeqw %%mm7, %%mm0 \n\t"
+ "psrlw $15, %%mm0 \n\t" // nonzero -> 1
+ "psrlw $14, %%mm1 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "por %%mm1, %%mm2 \n\t"
+ "psrlw $1, %%mm1 \n\t"
+ "pandn %%mm2, %%mm1 \n\t"
+ "movq %%mm1, %0 \n\t"
+ :"=m"(*bS[dir][edge])
+ ::"memory"
+ );
+ }
+ // the dir==0 (vertical-edge) pass always covers all 4 edges, one at a time
+ edges = 4;
+ step = 1;
+ }
+ // transpose the dir==0 strengths so bS[0] is indexed by edge
+ __asm__ volatile(
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm1 \n\t"
+ "movq 16(%0), %%mm2 \n\t"
+ "movq 24(%0), %%mm3 \n\t"
+ TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm3, 8(%0) \n\t"
+ "movq %%mm4, 16(%0) \n\t"
+ "movq %%mm2, 24(%0) \n\t"
+ ::"r"(bS[0])
+ :"memory"
+ );
+}
+
+/***********************************/
+/* motion compensation */
+
+// One output row of the 6-tap (1,-5,20,20,-5,1) vertical filter.
+// A..E hold five consecutive source rows as words; the next row is
+// loaded into F.  out = (A - 5B + 20C + 20D - 5E + F + 16) >> 5,
+// packed and stored via OP; advances src (%0) and dst (%1) pointers.
+// %4 must be ff_pw_5, %5 ff_pw_16, Z a zero register.  Clobbers A, T.
+#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
+ "mov"#q" "#C", "#T" \n\t"\
+ "mov"#d" (%0), "#F" \n\t"\
+ "paddw "#D", "#T" \n\t"\
+ "psllw $2, "#T" \n\t"\
+ "psubw "#B", "#T" \n\t"\
+ "psubw "#E", "#T" \n\t"\
+ "punpcklbw "#Z", "#F" \n\t"\
+ "pmullw %4, "#T" \n\t"\
+ "paddw %5, "#A" \n\t"\
+ "add %2, %0 \n\t"\
+ "paddw "#F", "#A" \n\t"\
+ "paddw "#A", "#T" \n\t"\
+ "psraw $5, "#T" \n\t"\
+ "packuswb "#T", "#T" \n\t"\
+ OP(T, (%1), A, d)\
+ "add %3, %1 \n\t"
+
+// Same 6-tap vertical filter, but keeps the 16-bit intermediate
+// (no shift/pack/clip) and stores it at offset OF into the tmp
+// buffer (%1) for the second, horizontal pass.  %3 must be ff_pw_5,
+// %4 the rounding constant.  Clobbers A, T; advances src (%0).
+#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
+ "mov"#q" "#C", "#T" \n\t"\
+ "mov"#d" (%0), "#F" \n\t"\
+ "paddw "#D", "#T" \n\t"\
+ "psllw $2, "#T" \n\t"\
+ "paddw %4, "#A" \n\t"\
+ "psubw "#B", "#T" \n\t"\
+ "psubw "#E", "#T" \n\t"\
+ "punpcklbw "#Z", "#F" \n\t"\
+ "pmullw %3, "#T" \n\t"\
+ "paddw "#F", "#A" \n\t"\
+ "add %2, %0 \n\t"\
+ "paddw "#A", "#T" \n\t"\
+ "mov"#q" "#T", "#OF"(%1) \n\t"
+
+// MMX vs SSE2 instantiations of the vertical-filter macros:
+// scratch/zero regs mm6/mm7 (xmm6/xmm7) and movd/movq (movq/movdqa).
+#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
+#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
+#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
+#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
+
+
+#define QPEL_H264(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ int h=4;\
+\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %5, %%mm4 \n\t"\
+ "movq %6, %%mm5 \n\t"\
+ "1: \n\t"\
+ "movd -1(%0), %%mm1 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "movd 1(%0), %%mm3 \n\t"\
+ "movd 2(%0), %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "paddw %%mm0, %%mm1 \n\t"\
+ "paddw %%mm3, %%mm2 \n\t"\
+ "movd -2(%0), %%mm0 \n\t"\
+ "movd 3(%0), %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "paddw %%mm3, %%mm0 \n\t"\
+ "psllw $2, %%mm2 \n\t"\
+ "psubw %%mm1, %%mm2 \n\t"\
+ "pmullw %%mm4, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm0 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "packuswb %%mm0, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm6, d)\
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+g"(h)\
+ : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+}\
+static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ int h=4;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %0, %%mm4 \n\t"\
+ "movq %1, %%mm5 \n\t"\
+ :: "m"(ff_pw_5), "m"(ff_pw_16)\
+ );\
+ do{\
+ __asm__ volatile(\
+ "movd -1(%0), %%mm1 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "movd 1(%0), %%mm3 \n\t"\
+ "movd 2(%0), %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "paddw %%mm0, %%mm1 \n\t"\
+ "paddw %%mm3, %%mm2 \n\t"\
+ "movd -2(%0), %%mm0 \n\t"\
+ "movd 3(%0), %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "paddw %%mm3, %%mm0 \n\t"\
+ "psllw $2, %%mm2 \n\t"\
+ "psubw %%mm1, %%mm2 \n\t"\
+ "pmullw %%mm4, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm0 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "movd (%2), %%mm3 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "packuswb %%mm0, %%mm0 \n\t"\
+ PAVGB" %%mm3, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm6, d)\
+ "add %4, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "add %3, %2 \n\t"\
+ : "+a"(src), "+c"(dst), "+d"(src2)\
+ : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+ }while(--h);\
+}\
+static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ src -= 2*srcStride;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd (%0), %%mm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+}\
+static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ int h=4;\
+ int w=3;\
+ src -= 2*srcStride+2;\
+ while(w--){\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd (%0), %%mm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
+ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
+ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
+ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
+ \
+ : "+a"(src)\
+ : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ tmp += 4;\
+ src += 4 - 9*srcStride;\
+ }\
+ tmp -= 3*4;\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "paddw 10(%0), %%mm0 \n\t"\
+ "movq 2(%0), %%mm1 \n\t"\
+ "paddw 8(%0), %%mm1 \n\t"\
+ "movq 4(%0), %%mm2 \n\t"\
+ "paddw 6(%0), %%mm2 \n\t"\
+ "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
+ "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
+ "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
+ "paddsw %%mm2, %%mm0 \n\t"\
+ "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\
+ "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\
+ "psraw $6, %%mm0 \n\t"\
+ "packuswb %%mm0, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm7, d)\
+ "add $24, %0 \n\t"\
+ "add %3, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(tmp), "+c"(dst), "+g"(h)\
+ : "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+}\
+\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ int h=8;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %5, %%mm6 \n\t"\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 1(%0), %%mm2 \n\t"\
+ "movq %%mm0, %%mm1 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "psllw $2, %%mm0 \n\t"\
+ "psllw $2, %%mm1 \n\t"\
+ "movq -1(%0), %%mm2 \n\t"\
+ "movq 2(%0), %%mm4 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "movq %%mm4, %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "punpckhbw %%mm7, %%mm5 \n\t"\
+ "paddw %%mm4, %%mm2 \n\t"\
+ "paddw %%mm3, %%mm5 \n\t"\
+ "psubw %%mm2, %%mm0 \n\t"\
+ "psubw %%mm5, %%mm1 \n\t"\
+ "pmullw %%mm6, %%mm0 \n\t"\
+ "pmullw %%mm6, %%mm1 \n\t"\
+ "movd -2(%0), %%mm2 \n\t"\
+ "movd 7(%0), %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm5 \n\t"\
+ "paddw %%mm3, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ "movq %6, %%mm5 \n\t"\
+ "paddw %%mm5, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm4, %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm5, q)\
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+g"(h)\
+ : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+}\
+\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ int h=8;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %0, %%mm6 \n\t"\
+ :: "m"(ff_pw_5)\
+ );\
+ do{\
+ __asm__ volatile(\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 1(%0), %%mm2 \n\t"\
+ "movq %%mm0, %%mm1 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "psllw $2, %%mm0 \n\t"\
+ "psllw $2, %%mm1 \n\t"\
+ "movq -1(%0), %%mm2 \n\t"\
+ "movq 2(%0), %%mm4 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "movq %%mm4, %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "punpckhbw %%mm7, %%mm5 \n\t"\
+ "paddw %%mm4, %%mm2 \n\t"\
+ "paddw %%mm3, %%mm5 \n\t"\
+ "psubw %%mm2, %%mm0 \n\t"\
+ "psubw %%mm5, %%mm1 \n\t"\
+ "pmullw %%mm6, %%mm0 \n\t"\
+ "pmullw %%mm6, %%mm1 \n\t"\
+ "movd -2(%0), %%mm2 \n\t"\
+ "movd 7(%0), %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm5 \n\t"\
+ "paddw %%mm3, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ "movq %5, %%mm5 \n\t"\
+ "paddw %%mm5, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm4, %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "movq (%2), %%mm4 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ PAVGB" %%mm4, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm5, q)\
+ "add %4, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "add %3, %2 \n\t"\
+ : "+a"(src), "+c"(dst), "+d"(src2)\
+ : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
+ "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ }while(--h);\
+}\
+\
+static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ int w= 2;\
+ src -= 2*srcStride;\
+ \
+ while(w--){\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd (%0), %%mm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
+ QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
+ QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ if(h==16){\
+ __asm__ volatile(\
+ QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
+ QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
+ QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ }\
+ src += 4-(h+5)*srcStride;\
+ dst += 4-h*dstStride;\
+ }\
+}\
+static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
+ int w = (size+8)>>2;\
+ src -= 2*srcStride+2;\
+ while(w--){\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd (%0), %%mm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
+ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
+ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
+ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
+ QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
+ QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
+ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
+ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
+ : "+a"(src)\
+ : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ if(size==16){\
+ __asm__ volatile(\
+ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
+ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
+ QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
+ QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
+ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
+ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
+ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
+ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
+ : "+a"(src)\
+ : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ }\
+ tmp += 4;\
+ src += 4 - (size+5)*srcStride;\
+ }\
+}\
+static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
+ int w = size>>4;\
+ do{\
+ int h = size;\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 8(%0), %%mm3 \n\t"\
+ "movq 2(%0), %%mm1 \n\t"\
+ "movq 10(%0), %%mm4 \n\t"\
+ "paddw %%mm4, %%mm0 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "paddw 18(%0), %%mm3 \n\t"\
+ "paddw 16(%0), %%mm4 \n\t"\
+ "movq 4(%0), %%mm2 \n\t"\
+ "movq 12(%0), %%mm5 \n\t"\
+ "paddw 6(%0), %%mm2 \n\t"\
+ "paddw 14(%0), %%mm5 \n\t"\
+ "psubw %%mm1, %%mm0 \n\t"\
+ "psubw %%mm4, %%mm3 \n\t"\
+ "psraw $2, %%mm0 \n\t"\
+ "psraw $2, %%mm3 \n\t"\
+ "psubw %%mm1, %%mm0 \n\t"\
+ "psubw %%mm4, %%mm3 \n\t"\
+ "paddsw %%mm2, %%mm0 \n\t"\
+ "paddsw %%mm5, %%mm3 \n\t"\
+ "psraw $2, %%mm0 \n\t"\
+ "psraw $2, %%mm3 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm5, %%mm3 \n\t"\
+ "psraw $6, %%mm0 \n\t"\
+ "psraw $6, %%mm3 \n\t"\
+ "packuswb %%mm3, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm7, q)\
+ "add $48, %0 \n\t"\
+ "add %3, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(tmp), "+c"(dst), "+g"(h)\
+ : "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+ tmp += 8 - size*24;\
+ dst += 8 - size*dstStride;\
+ }while(w--);\
+}\
+\
+static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+\
+static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+ src += 8*dstStride;\
+ dst += 8*dstStride;\
+ src2 += 8*src2Stride;\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+}\
+\
+static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
+ put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
+ OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
+}\
+static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
+}\
+\
+static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
+}\
+\
+static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
+{\
+ __asm__ volatile(\
+ "movq (%1), %%mm0 \n\t"\
+ "movq 24(%1), %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "packuswb %%mm0, %%mm0 \n\t"\
+ "packuswb %%mm1, %%mm1 \n\t"\
+ PAVGB" (%0), %%mm0 \n\t"\
+ PAVGB" (%0,%3), %%mm1 \n\t"\
+ OP(%%mm0, (%2), %%mm4, d)\
+ OP(%%mm1, (%2,%4), %%mm5, d)\
+ "lea (%0,%3,2), %0 \n\t"\
+ "lea (%2,%4,2), %2 \n\t"\
+ "movq 48(%1), %%mm0 \n\t"\
+ "movq 72(%1), %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "packuswb %%mm0, %%mm0 \n\t"\
+ "packuswb %%mm1, %%mm1 \n\t"\
+ PAVGB" (%0), %%mm0 \n\t"\
+ PAVGB" (%0,%3), %%mm1 \n\t"\
+ OP(%%mm0, (%2), %%mm4, d)\
+ OP(%%mm1, (%2,%4), %%mm5, d)\
+ :"+a"(src8), "+c"(src16), "+d"(dst)\
+ :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
+ :"memory");\
+}\
+static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
+{\
+ do{\
+ __asm__ volatile(\
+ "movq (%1), %%mm0 \n\t"\
+ "movq 8(%1), %%mm1 \n\t"\
+ "movq 48(%1), %%mm2 \n\t"\
+ "movq 8+48(%1), %%mm3 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "psraw $5, %%mm2 \n\t"\
+ "psraw $5, %%mm3 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ "packuswb %%mm3, %%mm2 \n\t"\
+ PAVGB" (%0), %%mm0 \n\t"\
+ PAVGB" (%0,%3), %%mm2 \n\t"\
+ OP(%%mm0, (%2), %%mm5, q)\
+ OP(%%mm2, (%2,%4), %%mm5, q)\
+ ::"a"(src8), "c"(src16), "d"(dst),\
+ "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
+ :"memory");\
+ src8 += 2L*src8Stride;\
+ src16 += 48;\
+ dst += 2L*dstStride;\
+ }while(h-=2);\
+}\
+static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
+{\
+ OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
+ OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
+}\
+
+
+#ifdef ARCH_X86_64
+#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ int h=16;\
+ __asm__ volatile(\
+ "pxor %%xmm15, %%xmm15 \n\t"\
+ "movdqa %6, %%xmm14 \n\t"\
+ "movdqa %7, %%xmm13 \n\t"\
+ "1: \n\t"\
+ "lddqu 3(%0), %%xmm1 \n\t"\
+ "lddqu -5(%0), %%xmm7 \n\t"\
+ "movdqa %%xmm1, %%xmm0 \n\t"\
+ "punpckhbw %%xmm15, %%xmm1 \n\t"\
+ "punpcklbw %%xmm15, %%xmm0 \n\t"\
+ "punpcklbw %%xmm15, %%xmm7 \n\t"\
+ "movdqa %%xmm1, %%xmm2 \n\t"\
+ "movdqa %%xmm0, %%xmm6 \n\t"\
+ "movdqa %%xmm1, %%xmm3 \n\t"\
+ "movdqa %%xmm0, %%xmm8 \n\t"\
+ "movdqa %%xmm1, %%xmm4 \n\t"\
+ "movdqa %%xmm0, %%xmm9 \n\t"\
+ "movdqa %%xmm1, %%xmm5 \n\t"\
+ "movdqa %%xmm0, %%xmm10 \n\t"\
+ "palignr $6, %%xmm0, %%xmm5 \n\t"\
+ "palignr $6, %%xmm7, %%xmm10\n\t"\
+ "palignr $8, %%xmm0, %%xmm4 \n\t"\
+ "palignr $8, %%xmm7, %%xmm9 \n\t"\
+ "palignr $10,%%xmm0, %%xmm3 \n\t"\
+ "palignr $10,%%xmm7, %%xmm8 \n\t"\
+ "paddw %%xmm1, %%xmm5 \n\t"\
+ "paddw %%xmm0, %%xmm10 \n\t"\
+ "palignr $12,%%xmm0, %%xmm2 \n\t"\
+ "palignr $12,%%xmm7, %%xmm6 \n\t"\
+ "palignr $14,%%xmm0, %%xmm1 \n\t"\
+ "palignr $14,%%xmm7, %%xmm0 \n\t"\
+ "paddw %%xmm3, %%xmm2 \n\t"\
+ "paddw %%xmm8, %%xmm6 \n\t"\
+ "paddw %%xmm4, %%xmm1 \n\t"\
+ "paddw %%xmm9, %%xmm0 \n\t"\
+ "psllw $2, %%xmm2 \n\t"\
+ "psllw $2, %%xmm6 \n\t"\
+ "psubw %%xmm1, %%xmm2 \n\t"\
+ "psubw %%xmm0, %%xmm6 \n\t"\
+ "paddw %%xmm13,%%xmm5 \n\t"\
+ "paddw %%xmm13,%%xmm10 \n\t"\
+ "pmullw %%xmm14,%%xmm2 \n\t"\
+ "pmullw %%xmm14,%%xmm6 \n\t"\
+ "lddqu (%2), %%xmm3 \n\t"\
+ "paddw %%xmm5, %%xmm2 \n\t"\
+ "paddw %%xmm10,%%xmm6 \n\t"\
+ "psraw $5, %%xmm2 \n\t"\
+ "psraw $5, %%xmm6 \n\t"\
+ "packuswb %%xmm2,%%xmm6 \n\t"\
+ "pavgb %%xmm3, %%xmm6 \n\t"\
+ OP(%%xmm6, (%1), %%xmm4, dqa)\
+ "add %5, %0 \n\t"\
+ "add %5, %1 \n\t"\
+ "add %4, %2 \n\t"\
+ "decl %3 \n\t"\
+ "jg 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
+ : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
+ "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+}
+#else // ARCH_X86_64
+#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+ src += 8*dstStride;\
+ dst += 8*dstStride;\
+ src2 += 8*src2Stride;\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+}
+#endif // ARCH_X86_64
+
+#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ int h=8;\
+ __asm__ volatile(\
+ "pxor %%xmm7, %%xmm7 \n\t"\
+ "movdqa %0, %%xmm6 \n\t"\
+ :: "m"(ff_pw_5)\
+ );\
+ do{\
+ __asm__ volatile(\
+ "lddqu -5(%0), %%xmm1 \n\t"\
+ "movdqa %%xmm1, %%xmm0 \n\t"\
+ "punpckhbw %%xmm7, %%xmm1 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "movdqa %%xmm1, %%xmm2 \n\t"\
+ "movdqa %%xmm1, %%xmm3 \n\t"\
+ "movdqa %%xmm1, %%xmm4 \n\t"\
+ "movdqa %%xmm1, %%xmm5 \n\t"\
+ "palignr $6, %%xmm0, %%xmm5 \n\t"\
+ "palignr $8, %%xmm0, %%xmm4 \n\t"\
+ "palignr $10,%%xmm0, %%xmm3 \n\t"\
+ "paddw %%xmm1, %%xmm5 \n\t"\
+ "palignr $12,%%xmm0, %%xmm2 \n\t"\
+ "palignr $14,%%xmm0, %%xmm1 \n\t"\
+ "paddw %%xmm3, %%xmm2 \n\t"\
+ "paddw %%xmm4, %%xmm1 \n\t"\
+ "psllw $2, %%xmm2 \n\t"\
+ "movq (%2), %%xmm3 \n\t"\
+ "psubw %%xmm1, %%xmm2 \n\t"\
+ "paddw %5, %%xmm5 \n\t"\
+ "pmullw %%xmm6, %%xmm2 \n\t"\
+ "paddw %%xmm5, %%xmm2 \n\t"\
+ "psraw $5, %%xmm2 \n\t"\
+ "packuswb %%xmm2, %%xmm2 \n\t"\
+ "pavgb %%xmm3, %%xmm2 \n\t"\
+ OP(%%xmm2, (%1), %%xmm4, q)\
+ "add %4, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "add %3, %2 \n\t"\
+ : "+a"(src), "+c"(dst), "+d"(src2)\
+ : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
+ "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ }while(--h);\
+}\
+QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ int h=8;\
+ __asm__ volatile(\
+ "pxor %%xmm7, %%xmm7 \n\t"\
+ "movdqa %5, %%xmm6 \n\t"\
+ "1: \n\t"\
+ "lddqu -5(%0), %%xmm1 \n\t"\
+ "movdqa %%xmm1, %%xmm0 \n\t"\
+ "punpckhbw %%xmm7, %%xmm1 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "movdqa %%xmm1, %%xmm2 \n\t"\
+ "movdqa %%xmm1, %%xmm3 \n\t"\
+ "movdqa %%xmm1, %%xmm4 \n\t"\
+ "movdqa %%xmm1, %%xmm5 \n\t"\
+ "palignr $6, %%xmm0, %%xmm5 \n\t"\
+ "palignr $8, %%xmm0, %%xmm4 \n\t"\
+ "palignr $10,%%xmm0, %%xmm3 \n\t"\
+ "paddw %%xmm1, %%xmm5 \n\t"\
+ "palignr $12,%%xmm0, %%xmm2 \n\t"\
+ "palignr $14,%%xmm0, %%xmm1 \n\t"\
+ "paddw %%xmm3, %%xmm2 \n\t"\
+ "paddw %%xmm4, %%xmm1 \n\t"\
+ "psllw $2, %%xmm2 \n\t"\
+ "psubw %%xmm1, %%xmm2 \n\t"\
+ "paddw %6, %%xmm5 \n\t"\
+ "pmullw %%xmm6, %%xmm2 \n\t"\
+ "paddw %%xmm5, %%xmm2 \n\t"\
+ "psraw $5, %%xmm2 \n\t"\
+ "packuswb %%xmm2, %%xmm2 \n\t"\
+ OP(%%xmm2, (%1), %%xmm4, q)\
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+g"(h)\
+ : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\
+ "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+}\
+static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+
+#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ src -= 2*srcStride;\
+ \
+ __asm__ volatile(\
+ "pxor %%xmm7, %%xmm7 \n\t"\
+ "movq (%0), %%xmm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movq (%0), %%xmm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movq (%0), %%xmm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movq (%0), %%xmm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movq (%0), %%xmm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm1 \n\t"\
+ "punpcklbw %%xmm7, %%xmm2 \n\t"\
+ "punpcklbw %%xmm7, %%xmm3 \n\t"\
+ "punpcklbw %%xmm7, %%xmm4 \n\t"\
+ QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
+ QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
+ QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
+ QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
+ QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
+ QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
+ QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
+ QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ if(h==16){\
+ __asm__ volatile(\
+ QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
+ QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
+ QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
+ QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
+ QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
+ QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
+ QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
+ QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ }\
+}\
+static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}
+
+static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
+ int w = (size+8)>>3;
+ src -= 2*srcStride+2;
+ while(w--){
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7 \n\t"
+ "movq (%0), %%xmm0 \n\t"
+ "add %2, %0 \n\t"
+ "movq (%0), %%xmm1 \n\t"
+ "add %2, %0 \n\t"
+ "movq (%0), %%xmm2 \n\t"
+ "add %2, %0 \n\t"
+ "movq (%0), %%xmm3 \n\t"
+ "add %2, %0 \n\t"
+ "movq (%0), %%xmm4 \n\t"
+ "add %2, %0 \n\t"
+ "punpcklbw %%xmm7, %%xmm0 \n\t"
+ "punpcklbw %%xmm7, %%xmm1 \n\t"
+ "punpcklbw %%xmm7, %%xmm2 \n\t"
+ "punpcklbw %%xmm7, %%xmm3 \n\t"
+ "punpcklbw %%xmm7, %%xmm4 \n\t"
+ QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
+ QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
+ QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
+ QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
+ QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
+ QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
+ QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
+ QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
+ : "+a"(src)
+ : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
+ : "memory"
+ );
+ if(size==16){
+ __asm__ volatile(
+ QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
+ QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
+ QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
+ QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
+ QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
+ QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
+ QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
+ QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
+ : "+a"(src)
+ : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
+ : "memory"
+ );
+ }
+ tmp += 8;
+ src += 8 - (size+5)*srcStride;
+ }
+}
+
+#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
+static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
+ int h = size;\
+ if(size == 16){\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movdqa 32(%0), %%xmm4 \n\t"\
+ "movdqa 16(%0), %%xmm5 \n\t"\
+ "movdqa (%0), %%xmm7 \n\t"\
+ "movdqa %%xmm4, %%xmm3 \n\t"\
+ "movdqa %%xmm4, %%xmm2 \n\t"\
+ "movdqa %%xmm4, %%xmm1 \n\t"\
+ "movdqa %%xmm4, %%xmm0 \n\t"\
+ "palignr $10, %%xmm5, %%xmm0 \n\t"\
+ "palignr $8, %%xmm5, %%xmm1 \n\t"\
+ "palignr $6, %%xmm5, %%xmm2 \n\t"\
+ "palignr $4, %%xmm5, %%xmm3 \n\t"\
+ "palignr $2, %%xmm5, %%xmm4 \n\t"\
+ "paddw %%xmm5, %%xmm0 \n\t"\
+ "paddw %%xmm4, %%xmm1 \n\t"\
+ "paddw %%xmm3, %%xmm2 \n\t"\
+ "movdqa %%xmm5, %%xmm6 \n\t"\
+ "movdqa %%xmm5, %%xmm4 \n\t"\
+ "movdqa %%xmm5, %%xmm3 \n\t"\
+ "palignr $8, %%xmm7, %%xmm4 \n\t"\
+ "palignr $2, %%xmm7, %%xmm6 \n\t"\
+ "palignr $10, %%xmm7, %%xmm3 \n\t"\
+ "paddw %%xmm6, %%xmm4 \n\t"\
+ "movdqa %%xmm5, %%xmm6 \n\t"\
+ "palignr $6, %%xmm7, %%xmm5 \n\t"\
+ "palignr $4, %%xmm7, %%xmm6 \n\t"\
+ "paddw %%xmm7, %%xmm3 \n\t"\
+ "paddw %%xmm6, %%xmm5 \n\t"\
+ \
+ "psubw %%xmm1, %%xmm0 \n\t"\
+ "psubw %%xmm4, %%xmm3 \n\t"\
+ "psraw $2, %%xmm0 \n\t"\
+ "psraw $2, %%xmm3 \n\t"\
+ "psubw %%xmm1, %%xmm0 \n\t"\
+ "psubw %%xmm4, %%xmm3 \n\t"\
+ "paddw %%xmm2, %%xmm0 \n\t"\
+ "paddw %%xmm5, %%xmm3 \n\t"\
+ "psraw $2, %%xmm0 \n\t"\
+ "psraw $2, %%xmm3 \n\t"\
+ "paddw %%xmm2, %%xmm0 \n\t"\
+ "paddw %%xmm5, %%xmm3 \n\t"\
+ "psraw $6, %%xmm0 \n\t"\
+ "psraw $6, %%xmm3 \n\t"\
+ "packuswb %%xmm0, %%xmm3 \n\t"\
+ OP(%%xmm3, (%1), %%xmm7, dqa)\
+ "add $48, %0 \n\t"\
+ "add %3, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(tmp), "+c"(dst), "+g"(h)\
+ : "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+ }else{\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movdqa 16(%0), %%xmm1 \n\t"\
+ "movdqa (%0), %%xmm0 \n\t"\
+ "movdqa %%xmm1, %%xmm2 \n\t"\
+ "movdqa %%xmm1, %%xmm3 \n\t"\
+ "movdqa %%xmm1, %%xmm4 \n\t"\
+ "movdqa %%xmm1, %%xmm5 \n\t"\
+ "palignr $10, %%xmm0, %%xmm5 \n\t"\
+ "palignr $8, %%xmm0, %%xmm4 \n\t"\
+ "palignr $6, %%xmm0, %%xmm3 \n\t"\
+ "palignr $4, %%xmm0, %%xmm2 \n\t"\
+ "palignr $2, %%xmm0, %%xmm1 \n\t"\
+ "paddw %%xmm5, %%xmm0 \n\t"\
+ "paddw %%xmm4, %%xmm1 \n\t"\
+ "paddw %%xmm3, %%xmm2 \n\t"\
+ "psubw %%xmm1, %%xmm0 \n\t"\
+ "psraw $2, %%xmm0 \n\t"\
+ "psubw %%xmm1, %%xmm0 \n\t"\
+ "paddw %%xmm2, %%xmm0 \n\t"\
+ "psraw $2, %%xmm0 \n\t"\
+ "paddw %%xmm2, %%xmm0 \n\t"\
+ "psraw $6, %%xmm0 \n\t"\
+ "packuswb %%xmm0, %%xmm0 \n\t"\
+ OP(%%xmm0, (%1), %%xmm7, q)\
+ "add $48, %0 \n\t"\
+ "add %3, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(tmp), "+c"(dst), "+g"(h)\
+ : "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+ }\
+}
+
+#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
+ put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
+ OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
+}\
+static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
+}\
+static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
+}\
+
+#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
+#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
+#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
+#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
+#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
+#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
+#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
+#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
+
+#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
+#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
+#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
+#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
+#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
+#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
+#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
+#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
+
+#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
+#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
+#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
+#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
+
+#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
+#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
+#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
+#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
+
+#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
+#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
+
+#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
+H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
+
+static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
+ put_pixels16_sse2(dst, src, stride, 16);
+}
+static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
+ avg_pixels16_sse2(dst, src, stride, 16);
+}
+#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
+#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
+
+#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
+}\
+
+#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
+}\
+
+#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
+}\
+
+#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint16_t, temp[SIZE*(SIZE<8?12:24)]);\
+ OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
+}\
+
+#define H264_MC_4816(MMX)\
+H264_MC(put_, 4, MMX, 8)\
+H264_MC(put_, 8, MMX, 8)\
+H264_MC(put_, 16,MMX, 8)\
+H264_MC(avg_, 4, MMX, 8)\
+H264_MC(avg_, 8, MMX, 8)\
+H264_MC(avg_, 16,MMX, 8)\
+
+#define H264_MC_816(QPEL, XMM)\
+QPEL(put_, 8, XMM, 16)\
+QPEL(put_, 16,XMM, 16)\
+QPEL(avg_, 8, XMM, 16)\
+QPEL(avg_, 16,XMM, 16)\
+
+
+#define AVG_3DNOW_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgusb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+#define AVG_MMX2_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+
+#define PAVGB "pavgusb"
+QPEL_H264(put_, PUT_OP, 3dnow)
+QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
+#undef PAVGB
+#define PAVGB "pavgb"
+QPEL_H264(put_, PUT_OP, mmx2)
+QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
+QPEL_H264_V_XMM(put_, PUT_OP, sse2)
+QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
+QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
+QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
+#ifdef HAVE_SSSE3
+QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
+QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
+QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
+QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
+QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
+QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3)
+#endif
+#undef PAVGB
+
+H264_MC_4816(3dnow)
+H264_MC_4816(mmx2)
+H264_MC_816(H264_MC_V, sse2)
+H264_MC_816(H264_MC_HV, sse2)
+#ifdef HAVE_SSSE3
+H264_MC_816(H264_MC_H, ssse3)
+H264_MC_816(H264_MC_HV, ssse3)
+#endif
+
+/* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */
+DECLARE_ALIGNED_8(static const uint64_t, h264_rnd_reg[4]) = {
+ 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL
+};
+
+#define H264_CHROMA_OP(S,D)
+#define H264_CHROMA_OP4(S,D,T)
+#define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx
+#define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx
+#define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
+#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
+#include "dsputil_h264_template_mmx.c"
+
+static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
+}
+static void put_h264_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg+2);
+}
+static void put_h264_chroma_mc4_mmx(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ put_h264_chroma_generic_mc4_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
+}
+
+#undef H264_CHROMA_OP
+#undef H264_CHROMA_OP4
+#undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC2_TMPL
+#undef H264_CHROMA_MC8_MV0
+
+#define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
+#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
+ "pavgb " #T ", " #D " \n\t"
+#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_mmx2
+#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_mmx2
+#define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
+#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
+#include "dsputil_h264_template_mmx.c"
+static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
+}
+static void avg_h264_chroma_mc4_mmx2(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ avg_h264_chroma_generic_mc4_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
+}
+#undef H264_CHROMA_OP
+#undef H264_CHROMA_OP4
+#undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC2_TMPL
+#undef H264_CHROMA_MC8_MV0
+
+#define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
+#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
+ "pavgusb " #T ", " #D " \n\t"
+#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_3dnow
+#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_3dnow
+#define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
+#include "dsputil_h264_template_mmx.c"
+static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ avg_h264_chroma_generic_mc8_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
+}
+static void avg_h264_chroma_mc4_3dnow(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ avg_h264_chroma_generic_mc4_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
+}
+#undef H264_CHROMA_OP
+#undef H264_CHROMA_OP4
+#undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC8_MV0
+
+#ifdef HAVE_SSSE3
+#define AVG_OP(X)
+#undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
+#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
+#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
+#include "dsputil_h264_template_ssse3.c"
+static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
+}
+static void put_h264_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0);
+}
+
+#undef AVG_OP
+#undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC8_MV0
+#define AVG_OP(X) X
+#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
+#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
+#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
+#include "dsputil_h264_template_ssse3.c"
+static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
+}
+#undef AVG_OP
+#undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC8_MV0
+#endif
+
+/***********************************/
+/* weighted prediction */
+
+static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
+{
+ int x, y;
+ offset <<= log2_denom;
+ offset += (1 << log2_denom) >> 1;
+ __asm__ volatile(
+ "movd %0, %%mm4 \n\t"
+ "movd %1, %%mm5 \n\t"
+ "movd %2, %%mm6 \n\t"
+ "pshufw $0, %%mm4, %%mm4 \n\t"
+ "pshufw $0, %%mm5, %%mm5 \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ :: "g"(weight), "g"(offset), "g"(log2_denom)
+ );
+ for(y=0; y<h; y+=2){
+ for(x=0; x<w; x+=4){
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "movd %1, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "pmullw %%mm4, %%mm0 \n\t"
+ "pmullw %%mm4, %%mm1 \n\t"
+ "paddsw %%mm5, %%mm0 \n\t"
+ "paddsw %%mm5, %%mm1 \n\t"
+ "psraw %%mm6, %%mm0 \n\t"
+ "psraw %%mm6, %%mm1 \n\t"
+ "packuswb %%mm7, %%mm0 \n\t"
+ "packuswb %%mm7, %%mm1 \n\t"
+ "movd %%mm0, %0 \n\t"
+ "movd %%mm1, %1 \n\t"
+ : "+m"(*(uint32_t*)(dst+x)),
+ "+m"(*(uint32_t*)(dst+x+stride))
+ );
+ }
+ dst += 2*stride;
+ }
+}
+
+static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
+{
+ int x, y;
+ offset = ((offset + 1) | 1) << log2_denom;
+ __asm__ volatile(
+ "movd %0, %%mm3 \n\t"
+ "movd %1, %%mm4 \n\t"
+ "movd %2, %%mm5 \n\t"
+ "movd %3, %%mm6 \n\t"
+ "pshufw $0, %%mm3, %%mm3 \n\t"
+ "pshufw $0, %%mm4, %%mm4 \n\t"
+ "pshufw $0, %%mm5, %%mm5 \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
+ );
+ for(y=0; y<h; y++){
+ for(x=0; x<w; x+=4){
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "movd %1, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "pmullw %%mm3, %%mm0 \n\t"
+ "pmullw %%mm4, %%mm1 \n\t"
+ "paddsw %%mm1, %%mm0 \n\t"
+ "paddsw %%mm5, %%mm0 \n\t"
+ "psraw %%mm6, %%mm0 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "movd %%mm0, %0 \n\t"
+ : "+m"(*(uint32_t*)(dst+x))
+ : "m"(*(uint32_t*)(src+x))
+ );
+ }
+ src += stride;
+ dst += stride;
+ }
+}
+
+#define H264_WEIGHT(W,H) \
+static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
+ ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
+} \
+static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
+ ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
+}
+
+H264_WEIGHT(16,16)
+H264_WEIGHT(16, 8)
+H264_WEIGHT( 8,16)
+H264_WEIGHT( 8, 8)
+H264_WEIGHT( 8, 4)
+H264_WEIGHT( 4, 8)
+H264_WEIGHT( 4, 4)
+H264_WEIGHT( 4, 2)
+
diff --git a/libavcodec/i386/idct_mmx.c b/libavcodec/x86/idct_mmx.c
similarity index 100%
rename from libavcodec/i386/idct_mmx.c
rename to libavcodec/x86/idct_mmx.c
diff --git a/libavcodec/i386/idct_mmx_xvid.c b/libavcodec/x86/idct_mmx_xvid.c
similarity index 100%
rename from libavcodec/i386/idct_mmx_xvid.c
rename to libavcodec/x86/idct_mmx_xvid.c
diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c
new file mode 100644
index 0000000..d8711a2
--- /dev/null
+++ b/libavcodec/x86/idct_sse2_xvid.c
@@ -0,0 +1,394 @@
+/*
+ * XVID MPEG-4 VIDEO CODEC
+ * - SSE2 inverse discrete cosine transform -
+ *
+ * Copyright(C) 2003 Pascal Massimino <skal at planet-d.net>
+ *
+ * Conversion to gcc syntax with modifications
+ * by Alexander Strange <astrange at ithinksw.com>
+ *
+ * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
+ *
+ * This file is part of FFmpeg.
+ *
+ * Vertical pass is an implementation of the scheme:
+ * Loeffler C., Ligtenberg A., and Moschytz C.S.:
+ * Practical Fast 1D DCT Algorithm with Eleven Multiplications,
+ * Proc. ICASSP 1989, 988-991.
+ *
+ * Horizontal pass is a double 4x4 vector/matrix multiplication,
+ * (see also Intel's Application Note 922:
+ * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+ * Copyright (C) 1999 Intel Corporation)
+ *
+ * More details at http://skal.planet-d.net/coding/dct.html
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with FFmpeg; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+#include "idct_xvid.h"
+
+/*!
+ * @file idct_sse2_xvid.c
+ * @brief SSE2 idct compatible with xvidmmx
+ */
+
+#define X8(x) x,x,x,x,x,x,x,x
+
+#define ROW_SHIFT 11
+#define COL_SHIFT 6
+
+DECLARE_ASM_CONST(16, int16_t, tan1[]) = {X8(13036)}; // tan( pi/16)
+DECLARE_ASM_CONST(16, int16_t, tan2[]) = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1
+DECLARE_ASM_CONST(16, int16_t, tan3[]) = {X8(43790)}; // tan(3pi/16)-1
+DECLARE_ASM_CONST(16, int16_t, sqrt2[])= {X8(23170)}; // 0.5/sqrt(2)
+DECLARE_ASM_CONST(8, uint8_t, m127[]) = {X8(127)};
+
+DECLARE_ASM_CONST(16, int16_t, iTab1[]) = {
+ 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
+ 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
+ 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
+ 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
+};
+
+DECLARE_ASM_CONST(16, int16_t, iTab2[]) = {
+ 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
+ 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
+ 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
+ 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
+};
+
+DECLARE_ASM_CONST(16, int16_t, iTab3[]) = {
+ 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
+ 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
+ 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
+ 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
+};
+
+DECLARE_ASM_CONST(16, int16_t, iTab4[]) = {
+ 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
+ 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
+ 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
+ 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
+};
+
+DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders[]) = {
+ 65536, 65536, 65536, 65536,
+ 3597, 3597, 3597, 3597,
+ 2260, 2260, 2260, 2260,
+ 1203, 1203, 1203, 1203,
+ 120, 120, 120, 120,
+ 512, 512, 512, 512
+};
+
+// Temporary storage before the column pass
+#define ROW1 "%%xmm6"
+#define ROW3 "%%xmm4"
+#define ROW5 "%%xmm5"
+#define ROW7 "%%xmm7"
+
+#define CLEAR_ODD(r) "pxor "r","r" \n\t"
+#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
+
+#ifdef ARCH_X86_64
+
+# define ROW0 "%%xmm8"
+# define REG0 ROW0
+# define ROW2 "%%xmm9"
+# define REG2 ROW2
+# define ROW4 "%%xmm10"
+# define REG4 ROW4
+# define ROW6 "%%xmm11"
+# define REG6 ROW6
+# define CLEAR_EVEN(r) CLEAR_ODD(r)
+# define PUT_EVEN(dst) PUT_ODD(dst)
+# define XMMS "%%xmm12"
+# define MOV_32_ONLY "#"
+# define SREG2 REG2
+# define TAN3 "%%xmm13"
+# define TAN1 "%%xmm14"
+
+#else
+
+# define ROW0 "(%0)"
+# define REG0 "%%xmm4"
+# define ROW2 "2*16(%0)"
+# define REG2 "%%xmm4"
+# define ROW4 "4*16(%0)"
+# define REG4 "%%xmm6"
+# define ROW6 "6*16(%0)"
+# define REG6 "%%xmm6"
+# define CLEAR_EVEN(r)
+# define PUT_EVEN(dst) \
+ "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
+ "movdqa %%xmm2, "dst" \n\t"
+# define XMMS "%%xmm2"
+# define MOV_32_ONLY "movdqa "
+# define SREG2 "%%xmm7"
+# define TAN3 "%%xmm0"
+# define TAN1 "%%xmm2"
+
+#endif
+
+#define ROUND(x) "paddd "MANGLE(x)
+
+#define JZ(reg, to) \
+ "testl "reg","reg" \n\t" \
+ "jz "to" \n\t"
+
+#define JNZ(reg, to) \
+ "testl "reg","reg" \n\t" \
+ "jnz "to" \n\t"
+
+#define TEST_ONE_ROW(src, reg, clear) \
+ clear \
+ "movq "src", %%mm1 \n\t" \
+ "por 8+"src", %%mm1 \n\t" \
+ "paddusb %%mm0, %%mm1 \n\t" \
+ "pmovmskb %%mm1, "reg" \n\t"
+
+#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
+ clear1 \
+ clear2 \
+ "movq "row1", %%mm1 \n\t" \
+ "por 8+"row1", %%mm1 \n\t" \
+ "movq "row2", %%mm2 \n\t" \
+ "por 8+"row2", %%mm2 \n\t" \
+ "paddusb %%mm0, %%mm1 \n\t" \
+ "paddusb %%mm0, %%mm2 \n\t" \
+ "pmovmskb %%mm1, "reg1" \n\t" \
+ "pmovmskb %%mm2, "reg2" \n\t"
+
+///IDCT pass on rows.
+#define iMTX_MULT(src, table, rounder, put) \
+ "movdqa "src", %%xmm3 \n\t" \
+ "movdqa %%xmm3, %%xmm0 \n\t" \
+ "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \
+ "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \
+ "pmaddwd "table", %%xmm0 \n\t" \
+ "pmaddwd 16+"table", %%xmm1 \n\t" \
+ "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \
+ "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \
+ "pmaddwd 32+"table", %%xmm2 \n\t" \
+ "pmaddwd 48+"table", %%xmm3 \n\t" \
+ "paddd %%xmm1, %%xmm0 \n\t" \
+ "paddd %%xmm3, %%xmm2 \n\t" \
+ rounder", %%xmm0 \n\t" \
+ "movdqa %%xmm2, %%xmm3 \n\t" \
+ "paddd %%xmm0, %%xmm2 \n\t" \
+ "psubd %%xmm3, %%xmm0 \n\t" \
+ "psrad $11, %%xmm2 \n\t" \
+ "psrad $11, %%xmm0 \n\t" \
+ "packssdw %%xmm0, %%xmm2 \n\t" \
+ put \
+ "1: \n\t"
+
+#define iLLM_HEAD \
+ "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
+ "movdqa "MANGLE(tan1)", "TAN1" \n\t" \
+
+///IDCT pass on columns.
+#define iLLM_PASS(dct) \
+ "movdqa "TAN3", %%xmm1 \n\t" \
+ "movdqa "TAN1", %%xmm3 \n\t" \
+ "pmulhw %%xmm4, "TAN3" \n\t" \
+ "pmulhw %%xmm5, %%xmm1 \n\t" \
+ "paddsw %%xmm4, "TAN3" \n\t" \
+ "paddsw %%xmm5, %%xmm1 \n\t" \
+ "psubsw %%xmm5, "TAN3" \n\t" \
+ "paddsw %%xmm4, %%xmm1 \n\t" \
+ "pmulhw %%xmm7, %%xmm3 \n\t" \
+ "pmulhw %%xmm6, "TAN1" \n\t" \
+ "paddsw %%xmm6, %%xmm3 \n\t" \
+ "psubsw %%xmm7, "TAN1" \n\t" \
+ "movdqa %%xmm3, %%xmm7 \n\t" \
+ "movdqa "TAN1", %%xmm6 \n\t" \
+ "psubsw %%xmm1, %%xmm3 \n\t" \
+ "psubsw "TAN3", "TAN1" \n\t" \
+ "paddsw %%xmm7, %%xmm1 \n\t" \
+ "paddsw %%xmm6, "TAN3" \n\t" \
+ "movdqa %%xmm3, %%xmm6 \n\t" \
+ "psubsw "TAN3", %%xmm3 \n\t" \
+ "paddsw %%xmm6, "TAN3" \n\t" \
+ "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
+ "pmulhw %%xmm4, %%xmm3 \n\t" \
+ "pmulhw %%xmm4, "TAN3" \n\t" \
+ "paddsw "TAN3", "TAN3" \n\t" \
+ "paddsw %%xmm3, %%xmm3 \n\t" \
+ "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
+ MOV_32_ONLY ROW2", "REG2" \n\t" \
+ MOV_32_ONLY ROW6", "REG6" \n\t" \
+ "movdqa %%xmm7, %%xmm5 \n\t" \
+ "pmulhw "REG6", %%xmm7 \n\t" \
+ "pmulhw "REG2", %%xmm5 \n\t" \
+ "paddsw "REG2", %%xmm7 \n\t" \
+ "psubsw "REG6", %%xmm5 \n\t" \
+ MOV_32_ONLY ROW0", "REG0" \n\t" \
+ MOV_32_ONLY ROW4", "REG4" \n\t" \
+ MOV_32_ONLY" "TAN1", (%0) \n\t" \
+ "movdqa "REG0", "XMMS" \n\t" \
+ "psubsw "REG4", "REG0" \n\t" \
+ "paddsw "XMMS", "REG4" \n\t" \
+ "movdqa "REG4", "XMMS" \n\t" \
+ "psubsw %%xmm7, "REG4" \n\t" \
+ "paddsw "XMMS", %%xmm7 \n\t" \
+ "movdqa "REG0", "XMMS" \n\t" \
+ "psubsw %%xmm5, "REG0" \n\t" \
+ "paddsw "XMMS", %%xmm5 \n\t" \
+ "movdqa %%xmm5, "XMMS" \n\t" \
+ "psubsw "TAN3", %%xmm5 \n\t" \
+ "paddsw "XMMS", "TAN3" \n\t" \
+ "movdqa "REG0", "XMMS" \n\t" \
+ "psubsw %%xmm3, "REG0" \n\t" \
+ "paddsw "XMMS", %%xmm3 \n\t" \
+ MOV_32_ONLY" (%0), "TAN1" \n\t" \
+ "psraw $6, %%xmm5 \n\t" \
+ "psraw $6, "REG0" \n\t" \
+ "psraw $6, "TAN3" \n\t" \
+ "psraw $6, %%xmm3 \n\t" \
+ "movdqa "TAN3", 1*16("dct") \n\t" \
+ "movdqa %%xmm3, 2*16("dct") \n\t" \
+ "movdqa "REG0", 5*16("dct") \n\t" \
+ "movdqa %%xmm5, 6*16("dct") \n\t" \
+ "movdqa %%xmm7, %%xmm0 \n\t" \
+ "movdqa "REG4", %%xmm4 \n\t" \
+ "psubsw %%xmm1, %%xmm7 \n\t" \
+ "psubsw "TAN1", "REG4" \n\t" \
+ "paddsw %%xmm0, %%xmm1 \n\t" \
+ "paddsw %%xmm4, "TAN1" \n\t" \
+ "psraw $6, %%xmm1 \n\t" \
+ "psraw $6, %%xmm7 \n\t" \
+ "psraw $6, "TAN1" \n\t" \
+ "psraw $6, "REG4" \n\t" \
+ "movdqa %%xmm1, ("dct") \n\t" \
+ "movdqa "TAN1", 3*16("dct") \n\t" \
+ "movdqa "REG4", 4*16("dct") \n\t" \
+ "movdqa %%xmm7, 7*16("dct") \n\t"
+
+///IDCT pass on columns, assuming rows 4-7 are zero.
+#define iLLM_PASS_SPARSE(dct) \
+ "pmulhw %%xmm4, "TAN3" \n\t" \
+ "paddsw %%xmm4, "TAN3" \n\t" \
+ "movdqa %%xmm6, %%xmm3 \n\t" \
+ "pmulhw %%xmm6, "TAN1" \n\t" \
+ "movdqa %%xmm4, %%xmm1 \n\t" \
+ "psubsw %%xmm1, %%xmm3 \n\t" \
+ "paddsw %%xmm6, %%xmm1 \n\t" \
+ "movdqa "TAN1", %%xmm6 \n\t" \
+ "psubsw "TAN3", "TAN1" \n\t" \
+ "paddsw %%xmm6, "TAN3" \n\t" \
+ "movdqa %%xmm3, %%xmm6 \n\t" \
+ "psubsw "TAN3", %%xmm3 \n\t" \
+ "paddsw %%xmm6, "TAN3" \n\t" \
+ "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
+ "pmulhw %%xmm4, %%xmm3 \n\t" \
+ "pmulhw %%xmm4, "TAN3" \n\t" \
+ "paddsw "TAN3", "TAN3" \n\t" \
+ "paddsw %%xmm3, %%xmm3 \n\t" \
+ "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
+ MOV_32_ONLY ROW2", "SREG2" \n\t" \
+ "pmulhw "SREG2", %%xmm5 \n\t" \
+ MOV_32_ONLY ROW0", "REG0" \n\t" \
+ "movdqa "REG0", %%xmm6 \n\t" \
+ "psubsw "SREG2", %%xmm6 \n\t" \
+ "paddsw "REG0", "SREG2" \n\t" \
+ MOV_32_ONLY" "TAN1", (%0) \n\t" \
+ "movdqa "REG0", "XMMS" \n\t" \
+ "psubsw %%xmm5, "REG0" \n\t" \
+ "paddsw "XMMS", %%xmm5 \n\t" \
+ "movdqa %%xmm5, "XMMS" \n\t" \
+ "psubsw "TAN3", %%xmm5 \n\t" \
+ "paddsw "XMMS", "TAN3" \n\t" \
+ "movdqa "REG0", "XMMS" \n\t" \
+ "psubsw %%xmm3, "REG0" \n\t" \
+ "paddsw "XMMS", %%xmm3 \n\t" \
+ MOV_32_ONLY" (%0), "TAN1" \n\t" \
+ "psraw $6, %%xmm5 \n\t" \
+ "psraw $6, "REG0" \n\t" \
+ "psraw $6, "TAN3" \n\t" \
+ "psraw $6, %%xmm3 \n\t" \
+ "movdqa "TAN3", 1*16("dct") \n\t" \
+ "movdqa %%xmm3, 2*16("dct") \n\t" \
+ "movdqa "REG0", 5*16("dct") \n\t" \
+ "movdqa %%xmm5, 6*16("dct") \n\t" \
+ "movdqa "SREG2", %%xmm0 \n\t" \
+ "movdqa %%xmm6, %%xmm4 \n\t" \
+ "psubsw %%xmm1, "SREG2" \n\t" \
+ "psubsw "TAN1", %%xmm6 \n\t" \
+ "paddsw %%xmm0, %%xmm1 \n\t" \
+ "paddsw %%xmm4, "TAN1" \n\t" \
+ "psraw $6, %%xmm1 \n\t" \
+ "psraw $6, "SREG2" \n\t" \
+ "psraw $6, "TAN1" \n\t" \
+ "psraw $6, %%xmm6 \n\t" \
+ "movdqa %%xmm1, ("dct") \n\t" \
+ "movdqa "TAN1", 3*16("dct") \n\t" \
+ "movdqa %%xmm6, 4*16("dct") \n\t" \
+ "movdqa "SREG2", 7*16("dct") \n\t"
+
+inline void ff_idct_xvid_sse2(short *block)
+{
+ __asm__ volatile(
+ "movq "MANGLE(m127)", %%mm0 \n\t"
+ iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
+ iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
+ iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
+
+ TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
+ JZ("%%eax", "1f")
+ iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
+
+ TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
+ TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
+ iLLM_HEAD
+ ASMALIGN(4)
+ JNZ("%%ecx", "2f")
+ JNZ("%%eax", "3f")
+ JNZ("%%edx", "4f")
+ JNZ("%%esi", "5f")
+ iLLM_PASS_SPARSE("%0")
+ "jmp 6f \n\t"
+ "2: \n\t"
+ iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
+ "3: \n\t"
+ iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
+ JZ("%%edx", "1f")
+ "4: \n\t"
+ iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
+ JZ("%%esi", "1f")
+ "5: \n\t"
+ iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
+#ifndef ARCH_X86_64
+ iLLM_HEAD
+#endif
+ iLLM_PASS("%0")
+ "6: \n\t"
+ : "+r"(block)
+ :
+ : "%eax", "%ecx", "%edx", "%esi", "memory");
+}
+
+void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
+{
+ ff_idct_xvid_sse2(block);
+ put_pixels_clamped_mmx(block, dest, line_size);
+}
+
+void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
+{
+ ff_idct_xvid_sse2(block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
diff --git a/libavcodec/x86/idct_xvid.h b/libavcodec/x86/idct_xvid.h
new file mode 100644
index 0000000..bddbdb9
--- /dev/null
+++ b/libavcodec/x86/idct_xvid.h
@@ -0,0 +1,37 @@
+/*
+ * XVID MPEG-4 VIDEO CODEC
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*!
+ * @file idct_xvid.h
+ * header for Xvid IDCT functions
+ */
+
+#ifndef AVCODEC_X86_IDCT_XVID_H
+#define AVCODEC_X86_IDCT_XVID_H
+
+#include <stdint.h>
+
+void ff_idct_xvid_mmx(short *block);
+void ff_idct_xvid_mmx2(short *block);
+void ff_idct_xvid_sse2(short *block);
+void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block);
+void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block);
+
+#endif /* AVCODEC_X86_IDCT_XVID_H */
diff --git a/libavcodec/x86/mathops.h b/libavcodec/x86/mathops.h
new file mode 100644
index 0000000..95377ac
--- /dev/null
+++ b/libavcodec/x86/mathops.h
@@ -0,0 +1,43 @@
+/*
+ * simple math operations
+ * Copyright (c) 2006 Michael Niedermayer <michaelni at gmx.at> et al
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_MATHOPS_H
+#define AVCODEC_X86_MATHOPS_H
+
+#define MULL(ra, rb, shift) \
+ ({ int rt, dummy; __asm__ (\
+ "imull %3 \n\t"\
+ "shrdl %4, %%edx, %%eax \n\t"\
+ : "=a"(rt), "=d"(dummy)\
+ : "a" ((int)ra), "rm" ((int)rb), "i"(shift));\
+ rt; })
+
+#define MULH(ra, rb) \
+ ({ int rt, dummy;\
+ __asm__ ("imull %3\n\t" : "=d"(rt), "=a"(dummy): "a" ((int)ra), "rm" ((int)rb));\
+ rt; })
+
+#define MUL64(ra, rb) \
+ ({ int64_t rt;\
+ __asm__ ("imull %2\n\t" : "=A"(rt) : "a" ((int)ra), "g" ((int)rb));\
+ rt; })
+
+#endif /* AVCODEC_X86_MATHOPS_H */
diff --git a/libavcodec/x86/mmx.h b/libavcodec/x86/mmx.h
new file mode 100644
index 0000000..d7a76bb
--- /dev/null
+++ b/libavcodec/x86/mmx.h
@@ -0,0 +1,267 @@
+/*
+ * mmx.h
+ * Copyright (C) 1997-2001 H. Dietz and R. Fisher
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVCODEC_X86_MMX_H
+#define AVCODEC_X86_MMX_H
+
+#warning Everything in this header is deprecated, use plain __asm__()! New code using this header will be rejected.
+
+
+#define mmx_i2r(op,imm,reg) \
+ __asm__ volatile (#op " %0, %%" #reg \
+ : /* nothing */ \
+ : "i" (imm) )
+
+#define mmx_m2r(op,mem,reg) \
+ __asm__ volatile (#op " %0, %%" #reg \
+ : /* nothing */ \
+ : "m" (mem))
+
+#define mmx_r2m(op,reg,mem) \
+ __asm__ volatile (#op " %%" #reg ", %0" \
+ : "=m" (mem) \
+ : /* nothing */ )
+
+#define mmx_r2r(op,regs,regd) \
+ __asm__ volatile (#op " %" #regs ", %" #regd)
+
+
+#define emms() __asm__ volatile ("emms")
+
+#define movd_m2r(var,reg) mmx_m2r (movd, var, reg)
+#define movd_r2m(reg,var) mmx_r2m (movd, reg, var)
+#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd)
+
+#define movq_m2r(var,reg) mmx_m2r (movq, var, reg)
+#define movq_r2m(reg,var) mmx_r2m (movq, reg, var)
+#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd)
+
+#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg)
+#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
+#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg)
+#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
+
+#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg)
+#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
+
+#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg)
+#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd)
+#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg)
+#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd)
+#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg)
+#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd)
+
+#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg)
+#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd)
+#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg)
+#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd)
+
+#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg)
+#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd)
+#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg)
+#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd)
+
+#define pand_m2r(var,reg) mmx_m2r (pand, var, reg)
+#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd)
+
+#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg)
+#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd)
+
+#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg)
+#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd)
+#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg)
+#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd)
+#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg)
+#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd)
+
+#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg)
+#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd)
+#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg)
+#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd)
+#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg)
+#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd)
+
+#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg)
+#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd)
+
+#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg)
+#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd)
+
+#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg)
+#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd)
+
+#define por_m2r(var,reg) mmx_m2r (por, var, reg)
+#define por_r2r(regs,regd) mmx_r2r (por, regs, regd)
+
+#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg)
+#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg)
+#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd)
+#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg)
+#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg)
+#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd)
+#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg)
+#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg)
+#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd)
+
+#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg)
+#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg)
+#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd)
+#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg)
+#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg)
+#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd)
+
+#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg)
+#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg)
+#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd)
+#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg)
+#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg)
+#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd)
+#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg)
+#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg)
+#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd)
+
+#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg)
+#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd)
+#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg)
+#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd)
+#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg)
+#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd)
+
+#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg)
+#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd)
+#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg)
+#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd)
+
+#define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg)
+#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd)
+#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg)
+#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd)
+
+#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg)
+#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd)
+#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg)
+#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd)
+#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg)
+#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd)
+
+#define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg)
+#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd)
+#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg)
+#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd)
+#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg)
+#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd)
+
+#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg)
+#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd)
+
+
+/* 3DNOW extensions */
+
+#define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg)
+#define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd)
+
+
+/* AMD MMX extensions - also available in intel SSE */
+
+
+#define mmx_m2ri(op,mem,reg,imm) \
+ __asm__ volatile (#op " %1, %0, %%" #reg \
+ : /* nothing */ \
+ : "m" (mem), "i" (imm))
+#define mmx_r2ri(op,regs,regd,imm) \
+ __asm__ volatile (#op " %0, %%" #regs ", %%" #regd \
+ : /* nothing */ \
+ : "i" (imm) )
+
+#define mmx_fetch(mem,hint) \
+ __asm__ volatile ("prefetch" #hint " %0" \
+ : /* nothing */ \
+ : "m" (mem))
+
+
+#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg)
+
+#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var)
+
+#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg)
+#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd)
+#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg)
+#define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd)
+
+#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm)
+
+#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm)
+
+#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg)
+#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd)
+
+#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg)
+#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd)
+
+#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg)
+#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd)
+
+#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg)
+#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd)
+
+#define pmovmskb(mmreg,reg) \
+ __asm__ volatile ("movmskps %" #mmreg ", %" #reg)
+
+#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg)
+#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd)
+
+#define prefetcht0(mem) mmx_fetch (mem, t0)
+#define prefetcht1(mem) mmx_fetch (mem, t1)
+#define prefetcht2(mem) mmx_fetch (mem, t2)
+#define prefetchnta(mem) mmx_fetch (mem, nta)
+
+#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg)
+#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd)
+
+#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm)
+#define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm)
+
+#define sfence() __asm__ volatile ("sfence\n\t")
+
+/* SSE2 */
+#define pshufhw_m2r(var,reg,imm) mmx_m2ri(pshufhw, var, reg, imm)
+#define pshufhw_r2r(regs,regd,imm) mmx_r2ri(pshufhw, regs, regd, imm)
+#define pshuflw_m2r(var,reg,imm) mmx_m2ri(pshuflw, var, reg, imm)
+#define pshuflw_r2r(regs,regd,imm) mmx_r2ri(pshuflw, regs, regd, imm)
+
+#define pshufd_r2r(regs,regd,imm) mmx_r2ri(pshufd, regs, regd, imm)
+
+#define movdqa_m2r(var,reg) mmx_m2r (movdqa, var, reg)
+#define movdqa_r2m(reg,var) mmx_r2m (movdqa, reg, var)
+#define movdqa_r2r(regs,regd) mmx_r2r (movdqa, regs, regd)
+#define movdqu_m2r(var,reg) mmx_m2r (movdqu, var, reg)
+#define movdqu_r2m(reg,var) mmx_r2m (movdqu, reg, var)
+#define movdqu_r2r(regs,regd) mmx_r2r (movdqu, regs, regd)
+
+#define pmullw_r2m(reg,var) mmx_r2m (pmullw, reg, var)
+
+#define pslldq_i2r(imm,reg) mmx_i2r (pslldq, imm, reg)
+#define psrldq_i2r(imm,reg) mmx_i2r (psrldq, imm, reg)
+
+#define punpcklqdq_r2r(regs,regd) mmx_r2r (punpcklqdq, regs, regd)
+#define punpckhqdq_r2r(regs,regd) mmx_r2r (punpckhqdq, regs, regd)
+
+
+#endif /* AVCODEC_X86_MMX_H */
diff --git a/libavcodec/i386/motion_est_mmx.c b/libavcodec/x86/motion_est_mmx.c
similarity index 100%
rename from libavcodec/i386/motion_est_mmx.c
rename to libavcodec/x86/motion_est_mmx.c
diff --git a/libavcodec/i386/mpegvideo_mmx.c b/libavcodec/x86/mpegvideo_mmx.c
similarity index 100%
rename from libavcodec/i386/mpegvideo_mmx.c
rename to libavcodec/x86/mpegvideo_mmx.c
diff --git a/libavcodec/i386/mpegvideo_mmx_template.c b/libavcodec/x86/mpegvideo_mmx_template.c
similarity index 100%
rename from libavcodec/i386/mpegvideo_mmx_template.c
rename to libavcodec/x86/mpegvideo_mmx_template.c
diff --git a/libavcodec/x86/rv40dsp_mmx.c b/libavcodec/x86/rv40dsp_mmx.c
new file mode 100644
index 0000000..47461c6
--- /dev/null
+++ b/libavcodec/x86/rv40dsp_mmx.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2008 Konstantin Shishkov, Mathieu Velten
+ *
+ * MMX-optimized DSP functions for RV40, based on H.264 optimizations by
+ * Michael Niedermayer and Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dsputil_mmx.h"
+
+/* bias interleaved with bias div 8, use p+1 to access bias div 8 */
+DECLARE_ALIGNED_8(static const uint64_t, rv40_bias_reg[4][8]) = {
+ { 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0010001000100010ULL, 0x0002000200020002ULL,
+ 0x0020002000200020ULL, 0x0004000400040004ULL, 0x0010001000100010ULL, 0x0002000200020002ULL },
+ { 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL,
+ 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL },
+ { 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0020002000200020ULL, 0x0004000400040004ULL,
+ 0x0010001000100010ULL, 0x0002000200020002ULL, 0x0020002000200020ULL, 0x0004000400040004ULL },
+ { 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL,
+ 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL }
+};
+
+static void put_rv40_chroma_mc8_mmx(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]);
+}
+static void put_rv40_chroma_mc4_mmx(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ put_h264_chroma_generic_mc4_mmx(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]);
+}
+static void avg_rv40_chroma_mc8_mmx2(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]);
+}
+static void avg_rv40_chroma_mc4_mmx2(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ avg_h264_chroma_generic_mc4_mmx2(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]);
+}
+static void avg_rv40_chroma_mc8_3dnow(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ avg_h264_chroma_generic_mc8_3dnow(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]);
+}
+static void avg_rv40_chroma_mc4_3dnow(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ avg_h264_chroma_generic_mc4_3dnow(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]);
+}
diff --git a/libavcodec/x86/simple_idct_mmx.c b/libavcodec/x86/simple_idct_mmx.c
new file mode 100644
index 0000000..5786744
--- /dev/null
+++ b/libavcodec/x86/simple_idct_mmx.c
@@ -0,0 +1,1295 @@
+/*
+ * Simple IDCT MMX
+ *
+ * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni at gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "libavcodec/dsputil.h"
+#include "libavcodec/simple_idct.h"
+
+/*
+23170.475006
+22725.260826
+21406.727617
+19265.545870
+16384.000000
+12872.826198
+8866.956905
+4520.335430
+*/
+#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#if 0
+#define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#else
+#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
+#endif
+#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+
+#define ROW_SHIFT 11
+#define COL_SHIFT 20 // 6
+
+DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
+DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
+
+DECLARE_ALIGNED(8, static const int16_t, coeffs[])= {
+ 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
+// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
+// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
+ 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
+ // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
+// 0, 0, 0, 0,
+// 0, 0, 0, 0,
+
+ C4, C4, C4, C4,
+ C4, -C4, C4, -C4,
+
+ C2, C6, C2, C6,
+ C6, -C2, C6, -C2,
+
+ C1, C3, C1, C3,
+ C5, C7, C5, C7,
+
+ C3, -C7, C3, -C7,
+-C1, -C5, -C1, -C5,
+
+ C5, -C1, C5, -C1,
+ C7, C3, C7, C3,
+
+ C7, -C5, C7, -C5,
+ C3, -C1, C3, -C1
+};
+
+#if 0
+static void unused_var_killer(void)
+{
+ int a= wm1010 + d40000;
+ temp[0]=a;
+}
+
+static void inline idctCol (int16_t * col, int16_t *input)
+{
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+ int a0, a1, a2, a3, b0, b1, b2, b3;
+ const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+/*
+ if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
+ col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
+ col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
+ return;
+ }*/
+
+col[8*0] = input[8*0 + 0];
+col[8*1] = input[8*2 + 0];
+col[8*2] = input[8*0 + 1];
+col[8*3] = input[8*2 + 1];
+col[8*4] = input[8*4 + 0];
+col[8*5] = input[8*6 + 0];
+col[8*6] = input[8*4 + 1];
+col[8*7] = input[8*6 + 1];
+
+ a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
+ a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
+ a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
+ a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
+
+ b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
+ b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
+ b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
+ b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
+
+ col[8*0] = (a0 + b0) >> COL_SHIFT;
+ col[8*1] = (a1 + b1) >> COL_SHIFT;
+ col[8*2] = (a2 + b2) >> COL_SHIFT;
+ col[8*3] = (a3 + b3) >> COL_SHIFT;
+ col[8*4] = (a3 - b3) >> COL_SHIFT;
+ col[8*5] = (a2 - b2) >> COL_SHIFT;
+ col[8*6] = (a1 - b1) >> COL_SHIFT;
+ col[8*7] = (a0 - b0) >> COL_SHIFT;
+}
+
+static void inline idctRow (int16_t * output, int16_t * input)
+{
+ int16_t row[8];
+
+ int a0, a1, a2, a3, b0, b1, b2, b3;
+ const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+
+row[0] = input[0];
+row[2] = input[1];
+row[4] = input[4];
+row[6] = input[5];
+row[1] = input[8];
+row[3] = input[9];
+row[5] = input[12];
+row[7] = input[13];
+
+ if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
+ row[0] = row[1] = row[2] = row[3] = row[4] =
+ row[5] = row[6] = row[7] = row[0]<<3;
+ output[0] = row[0];
+ output[2] = row[1];
+ output[4] = row[2];
+ output[6] = row[3];
+ output[8] = row[4];
+ output[10] = row[5];
+ output[12] = row[6];
+ output[14] = row[7];
+ return;
+ }
+
+ a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
+ a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
+ a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
+ a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
+
+ b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
+ b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
+ b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
+ b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
+
+ row[0] = (a0 + b0) >> ROW_SHIFT;
+ row[1] = (a1 + b1) >> ROW_SHIFT;
+ row[2] = (a2 + b2) >> ROW_SHIFT;
+ row[3] = (a3 + b3) >> ROW_SHIFT;
+ row[4] = (a3 - b3) >> ROW_SHIFT;
+ row[5] = (a2 - b2) >> ROW_SHIFT;
+ row[6] = (a1 - b1) >> ROW_SHIFT;
+ row[7] = (a0 - b0) >> ROW_SHIFT;
+
+ output[0] = row[0];
+ output[2] = row[1];
+ output[4] = row[2];
+ output[6] = row[3];
+ output[8] = row[4];
+ output[10] = row[5];
+ output[12] = row[6];
+ output[14] = row[7];
+}
+#endif
+
+static inline void idct(int16_t *block)
+{
+ DECLARE_ALIGNED(8, int64_t, align_tmp[16]);
+ int16_t * const temp= (int16_t*)align_tmp;
+
+ __asm__ volatile(
+#if 0 //Alternative, simpler variant
+
+#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ #rounder ", %%mm4 \n\t"\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ #rounder ", %%mm0 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm0, %%mm0 \n\t" \
+ "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
+ "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
+ "movq %%mm7, " #dst " \n\t"\
+ "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "movq %%mm2, 24+" #dst " \n\t"\
+ "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
+ "movq %%mm2, 8+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
+ "movq %%mm4, 16+" #dst " \n\t"\
+
+#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
+ "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
+ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm7, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm2, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm2, 32+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm4, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"\
+
+
+#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq "MANGLE(wm1010)", %%mm4 \n\t"\
+ "pand %%mm0, %%mm4 \n\t"\
+ "por %%mm1, %%mm4 \n\t"\
+ "por %%mm2, %%mm4 \n\t"\
+ "por %%mm3, %%mm4 \n\t"\
+ "packssdw %%mm4,%%mm4 \n\t"\
+ "movd %%mm4, %%eax \n\t"\
+ "orl %%eax, %%eax \n\t"\
+ "jz 1f \n\t"\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ #rounder ", %%mm4 \n\t"\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ #rounder ", %%mm0 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm0, %%mm0 \n\t" \
+ "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
+ "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
+ "movq %%mm7, " #dst " \n\t"\
+ "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "movq %%mm2, 24+" #dst " \n\t"\
+ "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
+ "movq %%mm2, 8+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
+ "movq %%mm4, 16+" #dst " \n\t"\
+ "jmp 2f \n\t"\
+ "1: \n\t"\
+ "pslld $16, %%mm0 \n\t"\
+ "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
+ "psrad $13, %%mm0 \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t"\
+ "movq %%mm0, " #dst " \n\t"\
+ "movq %%mm0, 8+" #dst " \n\t"\
+ "movq %%mm0, 16+" #dst " \n\t"\
+ "movq %%mm0, 24+" #dst " \n\t"\
+ "2: \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, rounder, shift)
+ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
+/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
+ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
+ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
+
+DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
+DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
+DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+
+#else
+
+#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq "MANGLE(wm1010)", %%mm4 \n\t"\
+ "pand %%mm0, %%mm4 \n\t"\
+ "por %%mm1, %%mm4 \n\t"\
+ "por %%mm2, %%mm4 \n\t"\
+ "por %%mm3, %%mm4 \n\t"\
+ "packssdw %%mm4,%%mm4 \n\t"\
+ "movd %%mm4, %%eax \n\t"\
+ "orl %%eax, %%eax \n\t"\
+ "jz 1f \n\t"\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ #rounder ", %%mm4 \n\t"\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ #rounder ", %%mm0 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm0, %%mm0 \n\t" \
+ "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
+ "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
+ "movq %%mm7, " #dst " \n\t"\
+ "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "movq %%mm2, 24+" #dst " \n\t"\
+ "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
+ "movq %%mm2, 8+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
+ "movq %%mm4, 16+" #dst " \n\t"\
+ "jmp 2f \n\t"\
+ "1: \n\t"\
+ "pslld $16, %%mm0 \n\t"\
+ "paddd "MANGLE(d40000)", %%mm0 \n\t"\
+ "psrad $13, %%mm0 \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t"\
+ "movq %%mm0, " #dst " \n\t"\
+ "movq %%mm0, 8+" #dst " \n\t"\
+ "movq %%mm0, 16+" #dst " \n\t"\
+ "movq %%mm0, 24+" #dst " \n\t"\
+ "2: \n\t"
+
+#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq %%mm0, %%mm4 \n\t"\
+ "por %%mm1, %%mm4 \n\t"\
+ "por %%mm2, %%mm4 \n\t"\
+ "por %%mm3, %%mm4 \n\t"\
+ "packssdw %%mm4,%%mm4 \n\t"\
+ "movd %%mm4, %%eax \n\t"\
+ "orl %%eax, %%eax \n\t"\
+ "jz " #bt " \n\t"\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ #rounder ", %%mm4 \n\t"\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ #rounder ", %%mm0 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm0, %%mm0 \n\t" \
+ "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
+ "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
+ "movq %%mm7, " #dst " \n\t"\
+ "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "movq %%mm2, 24+" #dst " \n\t"\
+ "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
+ "movq %%mm2, 8+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
+ "movq %%mm4, 16+" #dst " \n\t"\
+
+#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ #rounder ", %%mm4 \n\t"\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ #rounder ", %%mm0 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm0, %%mm0 \n\t" \
+ "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
+ "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
+ "movq %%mm7, " #dst " \n\t"\
+ "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "movq %%mm2, 24+" #dst " \n\t"\
+ "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
+ "movq %%mm2, 8+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
+ "movq %%mm4, 16+" #dst " \n\t"\
+
+//IDCT( src0, src4, src1, src5, dst, rounder, shift)
+DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
+Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
+Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
+
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
+ "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
+ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm7, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm2, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm2, 32+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm4, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+ "#" ASMALIGN(4) \
+ "4: \n\t"
+Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
+
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
+ "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
+ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm1, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm2, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm2, 32+" #dst " \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm1, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+ "#" ASMALIGN(4) \
+ "6: \n\t"
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
+
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm1, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm2, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm2, 32+" #dst " \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm1, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+ "#" ASMALIGN(4) \
+ "2: \n\t"
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
+
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm7, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm2, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm2, 32+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm4, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+ "#" ASMALIGN(4) \
+ "3: \n\t"
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 64(%2), %%mm3 \n\t"\
+ "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm7, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm1, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
+ "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm1, 32+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm4, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+ "#" ASMALIGN(4) \
+ "5: \n\t"
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
+ "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
+ "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
+ "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
+ "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
+ "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
+ "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
+ "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
+ "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
+ "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm3 \n\t"\
+ "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
+ "movq %%mm4, " #dst " \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
+ "movq %%mm0, 16+" #dst " \n\t"\
+ "movq %%mm0, 96+" #dst " \n\t"\
+ "movq %%mm4, 112+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movq %%mm5, 32+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movq %%mm6, 48+" #dst " \n\t"\
+ "movq %%mm6, 64+" #dst " \n\t"\
+ "movq %%mm5, 80+" #dst " \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+
+ "#" ASMALIGN(4) \
+ "1: \n\t"
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
+ "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
+ "movq 64(%2), %%mm1 \n\t"\
+ "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm3 \n\t"\
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm7, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm3, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
+ "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm3 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
+ "movd %%mm3, 32+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm4, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+
+ "#" ASMALIGN(4)
+ "7: \n\t"
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
+ "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
+ "movq %%mm4, " #dst " \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
+ "movq %%mm0, 16+" #dst " \n\t"\
+ "movq %%mm0, 96+" #dst " \n\t"\
+ "movq %%mm4, 112+" #dst " \n\t"\
+ "movq %%mm0, 32+" #dst " \n\t"\
+ "movq %%mm4, 48+" #dst " \n\t"\
+ "movq %%mm4, 64+" #dst " \n\t"\
+ "movq %%mm0, 80+" #dst " \n\t"
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+
+
+#endif
+
+/*
+Input
+ 00 40 04 44 20 60 24 64
+ 10 30 14 34 50 70 54 74
+ 01 41 03 43 21 61 23 63
+ 11 31 13 33 51 71 53 73
+ 02 42 06 46 22 62 26 66
+ 12 32 16 36 52 72 56 76
+ 05 45 07 47 25 65 27 67
+ 15 35 17 37 55 75 57 77
+
+Temp
+ 00 04 10 14 20 24 30 34
+ 40 44 50 54 60 64 70 74
+ 01 03 11 13 21 23 31 33
+ 41 43 51 53 61 63 71 73
+ 02 06 12 16 22 26 32 36
+ 42 46 52 56 62 66 72 76
+ 05 07 15 17 25 27 35 37
+ 45 47 55 57 65 67 75 77
+*/
+
+"9: \n\t"
+ :: "r" (block), "r" (temp), "r" (coeffs)
+ : "%eax"
+ );
+}
+
+void ff_simple_idct_mmx(int16_t *block)
+{
+ idct(block);
+}
+
+//FIXME merge add/put into the idct
+
+void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ idct(block);
+ put_pixels_clamped_mmx(block, dest, line_size);
+}
+void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ idct(block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
diff --git a/libavcodec/i386/snowdsp_mmx.c b/libavcodec/x86/snowdsp_mmx.c
similarity index 100%
rename from libavcodec/i386/snowdsp_mmx.c
rename to libavcodec/x86/snowdsp_mmx.c
diff --git a/libavcodec/i386/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
similarity index 100%
rename from libavcodec/i386/vc1dsp_mmx.c
rename to libavcodec/x86/vc1dsp_mmx.c
diff --git a/libavcodec/i386/vp3dsp_mmx.c b/libavcodec/x86/vp3dsp_mmx.c
similarity index 100%
rename from libavcodec/i386/vp3dsp_mmx.c
rename to libavcodec/x86/vp3dsp_mmx.c
diff --git a/libavcodec/x86/vp3dsp_mmx.h b/libavcodec/x86/vp3dsp_mmx.h
new file mode 100644
index 0000000..e565a33
--- /dev/null
+++ b/libavcodec/x86/vp3dsp_mmx.h
@@ -0,0 +1,35 @@
+/*
+ * vp3dsp MMX function declarations
+ * Copyright (c) 2007 Aurelien Jacobs <aurel at gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VP3DSP_MMX_H
+#define AVCODEC_X86_VP3DSP_MMX_H
+
+#include <stdint.h>
+#include "libavcodec/dsputil.h"
+
+void ff_vp3_idct_mmx(int16_t *data);
+void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
+
+void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
+void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
+
+#endif /* AVCODEC_X86_VP3DSP_MMX_H */
diff --git a/libavcodec/i386/vp3dsp_sse2.c b/libavcodec/x86/vp3dsp_sse2.c
similarity index 100%
rename from libavcodec/i386/vp3dsp_sse2.c
rename to libavcodec/x86/vp3dsp_sse2.c
diff --git a/libavcodec/x86/vp3dsp_sse2.h b/libavcodec/x86/vp3dsp_sse2.h
new file mode 100644
index 0000000..9094620
--- /dev/null
+++ b/libavcodec/x86/vp3dsp_sse2.h
@@ -0,0 +1,31 @@
+/*
+ * vp3dsp SSE2 function declarations
+ * Copyright (c) 2007 Aurelien Jacobs <aurel at gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VP3DSP_SSE2_H
+#define AVCODEC_X86_VP3DSP_SSE2_H
+
+#include "libavcodec/dsputil.h"
+
+void ff_vp3_idct_sse2(int16_t *input_data);
+void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
+
+#endif /* AVCODEC_X86_VP3DSP_SSE2_H */
diff --git a/libavcodec/x86/x86inc.asm b/libavcodec/x86/x86inc.asm
new file mode 100644
index 0000000..3729b5b
--- /dev/null
+++ b/libavcodec/x86/x86inc.asm
@@ -0,0 +1,540 @@
+;*****************************************************************************
+;* x86inc.asm
+;*****************************************************************************
+;* Copyright (C) 2005-2008 Loren Merritt <lorenm at u.washington.edu>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
+;*****************************************************************************
+
+; FIXME: All of the 64bit asm functions that take a stride as an argument
+; via register, assume that the high dword of that register is filled with 0.
+; This is true in practice (since we never do any 64bit arithmetic on strides,
+; and x264's strides are all positive), but is not guaranteed by the ABI.
+
+; Name of the .rodata section.
+; Kludge: Something on OS X fails to align .rodata even given an align attribute,
+; so use a different read-only section.
+%macro SECTION_RODATA 0
+ %ifidn __OUTPUT_FORMAT__,macho64
+ SECTION .text align=16
+ %elifidn __OUTPUT_FORMAT__,macho
+ SECTION .text align=16
+ fakegot:
+ %else
+ SECTION .rodata align=16
+ %endif
+%endmacro
+
+; PIC support macros. All these macros are totally harmless when PIC is
+; not defined but can ruin everything if misused in PIC mode. On x86_32, shared
+; objects cannot directly access global variables by address, they need to
+; go through the GOT (global offset table). Most OSes do not care about it
+; and let you load non-shared .so objects (Linux, Win32...). However, OS X
+; requires PIC code in its .dylib objects.
+;
+; - GLOBAL should be used as a suffix for global addressing, eg.
+; picgetgot ebx
+; mov eax, [foo GLOBAL]
+; instead of
+; mov eax, [foo]
+;
+; - picgetgot computes the GOT address into the given register in PIC
+; mode, otherwise does nothing. You need to do this before using GLOBAL.
+; Before in both execution order and compiled code order (so GLOBAL knows
+; which register the GOT is in).
+
+%ifndef PIC
+ %define GLOBAL
+ %macro picgetgot 1
+ %endmacro
+%elifdef ARCH_X86_64
+ %define PIC64
+ %define GLOBAL wrt rip
+ %macro picgetgot 1
+ %endmacro
+%else
+ %define PIC32
+ %ifidn __OUTPUT_FORMAT__,macho
+ ; There is no real global offset table on OS X, but we still
+ ; need to reference our variables by offset.
+ %macro picgetgot 1
+ call %%getgot
+ %%getgot:
+ pop %1
+ add %1, $$ - %%getgot
+ %undef GLOBAL
+ %define GLOBAL + %1 - fakegot
+ %endmacro
+ %else ; elf
+ extern _GLOBAL_OFFSET_TABLE_
+ %macro picgetgot 1
+ call %%getgot
+ %%getgot:
+ pop %1
+ add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
+ %undef GLOBAL
+ %define GLOBAL + %1 wrt ..gotoff
+ %endmacro
+ %endif
+%endif
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed.
+; %3 = whether global constants are used in this function. inits x86_32 PIC if needed.
+; %4 = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,0, dst, src, tmp
+; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE
+
+; REP_RET:
+; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
+; which are slow when a normal ret follows a branch.
+
+%macro DECLARE_REG 6
+ %define r%1q %2
+ %define r%1d %3
+ %define r%1w %4
+ %define r%1b %5
+ %define r%1m %6
+ %define r%1 %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 2
+ %define r%1q r%1
+ %define e%1q r%1
+ %define r%1d e%1
+ %define e%1d e%1
+ %define r%1w %1
+ %define e%1w %1
+ %define r%1b %2
+ %define e%1b %2
+%ifndef ARCH_X86_64
+ %define r%1 e%1
+%endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al
+DECLARE_REG_SIZE bx, bl
+DECLARE_REG_SIZE cx, cl
+DECLARE_REG_SIZE dx, dl
+DECLARE_REG_SIZE si, sil
+DECLARE_REG_SIZE di, dil
+DECLARE_REG_SIZE bp, bpl
+
+%ifdef ARCH_X86_64
+ %define gprsize 8
+%else
+ %define gprsize 4
+%endif
+
+%macro PUSH 1
+ push %1
+ %assign stack_offset stack_offset+gprsize
+%endmacro
+
+%macro POP 1
+ pop %1
+ %assign stack_offset stack_offset-gprsize
+%endmacro
+
+%macro SUB 2
+ sub %1, %2
+ %ifidn %1, rsp
+ %assign stack_offset stack_offset+(%2)
+ %endif
+%endmacro
+
+%macro ADD 2
+ add %1, %2
+ %ifidn %1, rsp
+ %assign stack_offset stack_offset-(%2)
+ %endif
+%endmacro
+
+%macro movifnidn 2
+ %ifnidn %1, %2
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movsxdifnidn 2
+ %ifnidn %1, %2
+ movsxd %1, %2
+ %endif
+%endmacro
+
+%macro ASSERT 1
+ %if (%1) == 0
+ %error assert failed
+ %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-*
+ %ifdef n_arg_names
+ %assign %%i 0
+ %rep n_arg_names
+ CAT_UNDEF arg_name %+ %%i, q
+ CAT_UNDEF arg_name %+ %%i, d
+ CAT_UNDEF arg_name %+ %%i, w
+ CAT_UNDEF arg_name %+ %%i, b
+ CAT_UNDEF arg_name, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+
+ %assign %%i 0
+ %rep %0
+ %xdefine %1q r %+ %%i %+ q
+ %xdefine %1d r %+ %%i %+ d
+ %xdefine %1w r %+ %%i %+ w
+ %xdefine %1b r %+ %%i %+ b
+ CAT_XDEFINE arg_name, %%i, %1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+ %assign n_arg_names %%i
+%endmacro
+
+%ifdef ARCH_X86_64 ;==========================================================
+%ifidn __OUTPUT_FORMAT__,win32
+
+DECLARE_REG 0, rcx, ecx, cx, cl, ecx
+DECLARE_REG 1, rdx, edx, dx, dl, edx
+DECLARE_REG 2, r8, r8d, r8w, r8b, r8d
+DECLARE_REG 3, r9, r9d, r9w, r9b, r9d
+DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40]
+DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48]
+DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
+%define r7m [rsp + stack_offset + 64]
+%define r8m [rsp + stack_offset + 72]
+
+%macro LOAD_IF_USED 2 ; reg_id, number_of_args
+ %if %1 < %2
+ mov r%1, [rsp + 8 + %1*8]
+ %endif
+%endmacro
+
+%else ;=======================================================================
+
+DECLARE_REG 0, rdi, edi, di, dil, edi
+DECLARE_REG 1, rsi, esi, si, sil, esi
+DECLARE_REG 2, rdx, edx, dx, dl, edx
+DECLARE_REG 3, rcx, ecx, cx, cl, ecx
+DECLARE_REG 4, r8, r8d, r8w, r8b, r8d
+DECLARE_REG 5, r9, r9d, r9w, r9b, r9d
+DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
+%define r7m [rsp + stack_offset + 16]
+%define r8m [rsp + stack_offset + 24]
+
+%macro LOAD_IF_USED 2 ; reg_id, number_of_args
+ %if %1 < %2
+ mov r%1, [rsp - 40 + %1*8]
+ %endif
+%endmacro
+
+%endif ; !WIN64
+
+%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+ ASSERT %2 >= %1
+ ASSERT %2 <= 7
+ %assign stack_offset 0
+%ifidn __OUTPUT_FORMAT__,win32
+ LOAD_IF_USED 4, %1
+ LOAD_IF_USED 5, %1
+%endif
+ LOAD_IF_USED 6, %1
+ DEFINE_ARGS %4
+%endmacro
+
+%macro RET 0
+ ret
+%endmacro
+
+%macro REP_RET 0
+ rep ret
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4]
+DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8]
+DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12]
+DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16]
+DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
+DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
+DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
+%define r7m [esp + stack_offset + 32]
+%define r8m [esp + stack_offset + 36]
+%define rsp esp
+
+%macro PUSH_IF_USED 1 ; reg_id
+ %if %1 < regs_used
+ push r%1
+ %assign stack_offset stack_offset+4
+ %endif
+%endmacro
+
+%macro POP_IF_USED 1 ; reg_id
+ %if %1 < regs_used
+ pop r%1
+ %endif
+%endmacro
+
+%macro LOAD_IF_USED 2 ; reg_id, number_of_args
+ %if %1 < %2
+ mov r%1, [esp + stack_offset + 4 + %1*4]
+ %endif
+%endmacro
+
+%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+ ASSERT %2 >= %1
+ %assign stack_offset 0
+ %assign regs_used %2
+ %ifdef PIC
+ %if %3
+ %assign regs_used regs_used+1
+ %endif
+ %endif
+ ASSERT regs_used <= 7
+ PUSH_IF_USED 3
+ PUSH_IF_USED 4
+ PUSH_IF_USED 5
+ PUSH_IF_USED 6
+ LOAD_IF_USED 0, %1
+ LOAD_IF_USED 1, %1
+ LOAD_IF_USED 2, %1
+ LOAD_IF_USED 3, %1
+ LOAD_IF_USED 4, %1
+ LOAD_IF_USED 5, %1
+ LOAD_IF_USED 6, %1
+ %if %3
+ picgetgot r%2
+ %endif
+ DEFINE_ARGS %4
+%endmacro
+
+%macro RET 0
+ POP_IF_USED 6
+ POP_IF_USED 5
+ POP_IF_USED 4
+ POP_IF_USED 3
+ ret
+%endmacro
+
+%macro REP_RET 0
+ %if regs_used > 3
+ RET
+ %else
+ rep ret
+ %endif
+%endmacro
+
+%endif ;======================================================================
+
+
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Symbol prefix for C linkage
+%macro cglobal 1-2+
+ %xdefine %1 ff_%1
+ %ifdef PREFIX
+ %xdefine %1 _ %+ %1
+ %endif
+ %ifidn __OUTPUT_FORMAT__,elf
+ global %1:function hidden
+ %else
+ global %1
+ %endif
+ align function_align
+ %1:
+ RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+ %if %0 > 1
+ PROLOGUE %2
+ %endif
+%endmacro
+
+%macro cextern 1
+ %ifdef PREFIX
+ extern _%1
+ %define %1 _%1
+ %else
+ extern %1
+ %endif
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is
+; executable by default.
+%ifidn __OUTPUT_FORMAT__,elf
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+%assign FENC_STRIDE 16
+%assign FDEC_STRIDE 32
+
+; merge mmx and sse*
+
+%macro CAT_XDEFINE 3
+ %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2
+ %undef %1%2
+%endmacro
+
+%macro INIT_MMX 0
+ %define RESET_MM_PERMUTATION INIT_MMX
+ %define mmsize 8
+ %define num_mmregs 8
+ %define mova movq
+ %define movu movq
+ %define movh movd
+ %define movnt movntq
+ %assign %%i 0
+ %rep 8
+ CAT_XDEFINE m, %%i, mm %+ %%i
+ CAT_XDEFINE nmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %rep 8
+ CAT_UNDEF m, %%i
+ CAT_UNDEF nmm, %%i
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro INIT_XMM 0
+ %define RESET_MM_PERMUTATION INIT_XMM
+ %define mmsize 16
+ %define num_mmregs 8
+ %ifdef ARCH_X86_64
+ %define num_mmregs 16
+ %endif
+ %define mova movdqa
+ %define movu movdqu
+ %define movh movq
+ %define movnt movntdq
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, xmm %+ %%i
+ CAT_XDEFINE nxmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+INIT_MMX
+
+; I often want to use macros that permute their arguments. e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+%rep %0/2
+ %xdefine tmp%2 m%2
+ %xdefine ntmp%2 nm%2
+ %rotate 2
+%endrep
+%rep %0/2
+ %xdefine m%1 tmp%2
+ %xdefine nm%1 ntmp%2
+ %undef tmp%2
+ %undef ntmp%2
+ %rotate 2
+%endrep
+%endmacro
+
+%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
+%rep %0-1
+%ifdef m%1
+ %xdefine tmp m%1
+ %xdefine m%1 m%2
+ %xdefine m%2 tmp
+ CAT_XDEFINE n, m%1, %1
+ CAT_XDEFINE n, m%2, %2
+%else
+ ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
+ ; Be careful using this mode in nested macros though, as in some cases there may be
+ ; other copies of m# that have already been dereferenced and don't get updated correctly.
+ %xdefine %%n1 n %+ %1
+ %xdefine %%n2 n %+ %2
+ %xdefine tmp m %+ %%n1
+ CAT_XDEFINE m, %%n1, m %+ %%n2
+ CAT_XDEFINE m, %%n2, tmp
+ CAT_XDEFINE n, m %+ %%n1, %%n1
+ CAT_XDEFINE n, m %+ %%n2, %%n2
+%endif
+ %undef tmp
+ %rotate 1
+%endrep
+%endmacro
+
+%macro SAVE_MM_PERMUTATION 1
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE %1_m, %%i, m %+ %%i
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 1
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, %1_m %+ %%i
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro call 1
+ call %1
+ %ifdef %1_m0
+ LOAD_MM_PERMUTATION %1
+ %endif
+%endmacro
+
+; substitutions which are functionally identical but reduce code size
+%define movdqa movaps
+%define movdqu movups
+
diff --git a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm
new file mode 100644
index 0000000..2e318ef
--- /dev/null
+++ b/libavcodec/x86/x86util.asm
@@ -0,0 +1,240 @@
+;*****************************************************************************
+;* x86util.asm
+;*****************************************************************************
+;* Copyright (C) 2008 Loren Merritt <lorenm at u.washington.edu>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;*****************************************************************************
+
+%macro SBUTTERFLY 4
+ mova m%4, m%2
+ punpckl%1 m%2, m%3
+ punpckh%1 m%4, m%3
+ SWAP %3, %4
+%endmacro
+
+%macro TRANSPOSE4x4W 5
+ SBUTTERFLY wd, %1, %2, %5
+ SBUTTERFLY wd, %3, %4, %5
+ SBUTTERFLY dq, %1, %3, %5
+ SBUTTERFLY dq, %2, %4, %5
+ SWAP %2, %3
+%endmacro
+
+%macro TRANSPOSE2x4x4W 5
+ SBUTTERFLY wd, %1, %2, %5
+ SBUTTERFLY wd, %3, %4, %5
+ SBUTTERFLY dq, %1, %3, %5
+ SBUTTERFLY dq, %2, %4, %5
+ SBUTTERFLY qdq, %1, %2, %5
+ SBUTTERFLY qdq, %3, %4, %5
+%endmacro
+
+%macro TRANSPOSE4x4D 5
+ SBUTTERFLY dq, %1, %2, %5
+ SBUTTERFLY dq, %3, %4, %5
+ SBUTTERFLY qdq, %1, %3, %5
+ SBUTTERFLY qdq, %2, %4, %5
+ SWAP %2, %3
+%endmacro
+
+%macro TRANSPOSE8x8W 9-11
+%ifdef ARCH_X86_64
+ SBUTTERFLY wd, %1, %2, %9
+ SBUTTERFLY wd, %3, %4, %9
+ SBUTTERFLY wd, %5, %6, %9
+ SBUTTERFLY wd, %7, %8, %9
+ SBUTTERFLY dq, %1, %3, %9
+ SBUTTERFLY dq, %2, %4, %9
+ SBUTTERFLY dq, %5, %7, %9
+ SBUTTERFLY dq, %6, %8, %9
+ SBUTTERFLY qdq, %1, %5, %9
+ SBUTTERFLY qdq, %2, %6, %9
+ SBUTTERFLY qdq, %3, %7, %9
+ SBUTTERFLY qdq, %4, %8, %9
+ SWAP %2, %5
+ SWAP %4, %7
+%else
+; in: m0..m7, unless %11 in which case m6 is in %9
+; out: m0..m7, unless %11 in which case m4 is in %10
+; spills into %9 and %10
+%if %0<11
+ movdqa %9, m%7
+%endif
+ SBUTTERFLY wd, %1, %2, %7
+ movdqa %10, m%2
+ movdqa m%7, %9
+ SBUTTERFLY wd, %3, %4, %2
+ SBUTTERFLY wd, %5, %6, %2
+ SBUTTERFLY wd, %7, %8, %2
+ SBUTTERFLY dq, %1, %3, %2
+ movdqa %9, m%3
+ movdqa m%2, %10
+ SBUTTERFLY dq, %2, %4, %3
+ SBUTTERFLY dq, %5, %7, %3
+ SBUTTERFLY dq, %6, %8, %3
+ SBUTTERFLY qdq, %1, %5, %3
+ SBUTTERFLY qdq, %2, %6, %3
+ movdqa %10, m%2
+ movdqa m%3, %9
+ SBUTTERFLY qdq, %3, %7, %2
+ SBUTTERFLY qdq, %4, %8, %2
+ SWAP %2, %5
+ SWAP %4, %7
+%if %0<11
+ movdqa m%5, %10
+%endif
+%endif
+%endmacro
+
+%macro ABS1_MMX 2 ; a, tmp
+ pxor %2, %2
+ psubw %2, %1
+ pmaxsw %1, %2
+%endmacro
+
+%macro ABS2_MMX 4 ; a, b, tmp0, tmp1
+ pxor %3, %3
+ pxor %4, %4
+ psubw %3, %1
+ psubw %4, %2
+ pmaxsw %1, %3
+ pmaxsw %2, %4
+%endmacro
+
+%macro ABS1_SSSE3 2
+ pabsw %1, %1
+%endmacro
+
+%macro ABS2_SSSE3 4
+ pabsw %1, %1
+ pabsw %2, %2
+%endmacro
+
+%define ABS1 ABS1_MMX
+%define ABS2 ABS2_MMX
+
+%macro ABS4 6
+ ABS2 %1, %2, %5, %6
+ ABS2 %3, %4, %5, %6
+%endmacro
+
+%macro SPLATB_MMX 3
+ movd %1, [%2-3] ;to avoid crossing a cacheline
+ punpcklbw %1, %1
+%if mmsize==16
+ pshuflw %1, %1, 0xff
+ punpcklqdq %1, %1
+%else
+ pshufw %1, %1, 0xff
+%endif
+%endmacro
+
+%macro SPLATB_SSSE3 3
+ movd %1, [%2-3]
+ pshufb %1, %3
+%endmacro
+
+%macro PALIGNR_MMX 4
+ %ifnidn %4, %2
+ mova %4, %2
+ %endif
+ %if mmsize == 8
+ psllq %1, (8-%3)*8
+ psrlq %4, %3*8
+ %else
+ pslldq %1, 16-%3
+ psrldq %4, %3
+ %endif
+ por %1, %4
+%endmacro
+
+%macro PALIGNR_SSSE3 4
+ palignr %1, %2, %3
+%endmacro
+
+%macro SUMSUB_BA 2
+ paddw %1, %2
+ paddw %2, %2
+ psubw %2, %1
+%endmacro
+
+%macro SUMSUB_BADC 4
+ paddw %1, %2
+ paddw %3, %4
+ paddw %2, %2
+ paddw %4, %4
+ psubw %2, %1
+ psubw %4, %3
+%endmacro
+
+%macro HADAMARD8_1D 8
+ SUMSUB_BADC %1, %5, %2, %6
+ SUMSUB_BADC %3, %7, %4, %8
+ SUMSUB_BADC %1, %3, %2, %4
+ SUMSUB_BADC %5, %7, %6, %8
+ SUMSUB_BADC %1, %2, %3, %4
+ SUMSUB_BADC %5, %6, %7, %8
+%endmacro
+
+%macro SUMSUB2_AB 3
+ mova %3, %1
+ paddw %1, %1
+ paddw %1, %2
+ psubw %3, %2
+ psubw %3, %2
+%endmacro
+
+%macro SUMSUBD2_AB 4
+ mova %4, %1
+ mova %3, %2
+ psraw %2, 1
+ psraw %4, 1
+ paddw %1, %2
+ psubw %4, %3
+%endmacro
+
+%macro LOAD_DIFF 5
+%ifidn %3, none
+ movh %1, %4
+ movh %2, %5
+ punpcklbw %1, %2
+ punpcklbw %2, %2
+ psubw %1, %2
+%else
+ movh %1, %4
+ punpcklbw %1, %3
+ movh %2, %5
+ punpcklbw %2, %3
+ psubw %1, %2
+%endif
+%endmacro
+
+%macro LOAD_DIFF_8x4P 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer
+ LOAD_DIFF %1, %5, none, [%7], [%8]
+ LOAD_DIFF %2, %6, none, [%7+r1], [%8+r3]
+ LOAD_DIFF %3, %5, none, [%7+2*r1], [%8+2*r3]
+ LOAD_DIFF %4, %6, none, [%7+r4], [%8+r5]
+%endmacro
+
+%macro STORE_DIFF 4
+ psraw %1, 6
+ movh %2, %4
+ punpcklbw %2, %3
+ paddsw %1, %2
+ packuswb %1, %1
+ movh %4, %1
+%endmacro
+
diff --git a/libavcodec/xsubdec.c b/libavcodec/xsubdec.c
index 5f7d8fd..9ac315b 100644
--- a/libavcodec/xsubdec.c
+++ b/libavcodec/xsubdec.c
@@ -80,31 +80,32 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size,
// allocate sub and set values
if (!sub->rects) {
- sub->rects = av_mallocz(sizeof(AVSubtitleRect));
+ sub->rects = av_mallocz(sizeof(*sub->rects));
+ sub->rects[0] = av_mallocz(sizeof(*sub->rects[0]));
sub->num_rects = 1;
}
- av_freep(&sub->rects[0].bitmap);
- sub->rects[0].x = x; sub->rects[0].y = y;
- sub->rects[0].w = w; sub->rects[0].h = h;
- sub->rects[0].linesize = w;
- sub->rects[0].bitmap = av_malloc(w * h);
- sub->rects[0].nb_colors = 4;
- sub->rects[0].rgba_palette = av_malloc(sub->rects[0].nb_colors * 4);
+ av_freep(&sub->rects[0]->pict.data[0]);
+ sub->rects[0]->x = x; sub->rects[0]->y = y;
+ sub->rects[0]->w = w; sub->rects[0]->h = h;
+ sub->rects[0]->pict.linesize[0] = w;
+ sub->rects[0]->pict.data[0] = av_malloc(w * h);
+ sub->rects[0]->nb_colors = 4;
+ sub->rects[0]->pict.data[1] = av_malloc(sub->rects[0]->nb_colors * 4);
// read palette
- for (i = 0; i < sub->rects[0].nb_colors; i++)
- sub->rects[0].rgba_palette[i] = bytestream_get_be24(&buf);
+ for (i = 0; i < sub->rects[0]->nb_colors; i++)
+ ((uint32_t*)sub->rects[0]->pict.data[1])[i] = bytestream_get_be24(&buf);
// make all except background (first entry) non-transparent
- for (i = 1; i < sub->rects[0].nb_colors; i++)
- sub->rects[0].rgba_palette[i] |= 0xff000000;
+ for (i = 1; i < sub->rects[0]->nb_colors; i++)
+ ((uint32_t*)sub->rects[0]->pict.data[1])[i] |= 0xff000000;
// process RLE-compressed data
rlelen = FFMIN(rlelen, buf_end - buf);
init_get_bits(&gb, buf, rlelen * 8);
- bitmap = sub->rects[0].bitmap;
+ bitmap = sub->rects[0]->pict.data[0];
for (y = 0; y < h; y++) {
// interlaced: do odd lines
- if (y == (h + 1) / 2) bitmap = sub->rects[0].bitmap + w;
+ if (y == (h + 1) / 2) bitmap = sub->rects[0]->pict.data[0] + w;
for (x = 0; x < w; ) {
int log2 = ff_log2_tab[show_bits(&gb, 8)];
int run = get_bits(&gb, 14 - 4 * (log2 >> 1));
diff --git a/libavcodec/xvmc_render.h b/libavcodec/xvmc_render.h
index 8db4357..d9aa154 100644
--- a/libavcodec/xvmc_render.h
+++ b/libavcodec/xvmc_render.h
@@ -29,47 +29,47 @@
#include <X11/extensions/XvMClib.h>
-//the surface should be shown, video driver manipulates this
+//the surface should be shown, the video driver manipulates this
#define MP_XVMC_STATE_DISPLAY_PENDING 1
-//the surface is needed for prediction, codec manipulates this
+//the surface is needed for prediction, the codec manipulates this
#define MP_XVMC_STATE_PREDICTION 2
//this surface is needed for subpicture rendering
#define MP_XVMC_STATE_OSD_SOURCE 4
// 1337 IDCT MCo
#define MP_XVMC_RENDER_MAGIC 0x1DC711C0
-typedef struct{
-//these are not changed by the decoder!
- int magic;
+struct xvmc_render_state {
+ //these are not changed by the decoder!
+ int magic;
- short * data_blocks;
- XvMCMacroBlock * mv_blocks;
- int total_number_of_mv_blocks;
- int total_number_of_data_blocks;
- int mc_type;//XVMC_MPEG1/2/4,XVMC_H263 without XVMC_IDCT
- int idct;//Do we use IDCT acceleration?
- int chroma_format;//420,422,444
- int unsigned_intra;//+-128 for intra pictures after clip
- XvMCSurface* p_surface;//pointer to rendered surface, never changed
+ short * data_blocks;
+ XvMCMacroBlock * mv_blocks;
+ int total_number_of_mv_blocks;
+ int total_number_of_data_blocks;
+ int mc_type; //XVMC_MPEG1/2/4,XVMC_H263 without XVMC_IDCT
+ int idct; //Do we use IDCT acceleration?
+ int chroma_format; //420, 422, 444
+ int unsigned_intra; //+-128 for intra pictures after clipping
+ XvMCSurface* p_surface; //pointer to rendered surface, never changed
-//these are changed by decoder
-//used by XvMCRenderSurface function
- XvMCSurface* p_past_surface;//pointer to the past surface
- XvMCSurface* p_future_surface;//pointer to the future prediction surface
+ //these are changed by the decoder
+ //used by the XvMCRenderSurface function
+ XvMCSurface* p_past_surface; //pointer to the past surface
+ XvMCSurface* p_future_surface; //pointer to the future prediction surface
- unsigned int picture_structure;//top/bottom fields or frame!
- unsigned int flags;//XVMC_SECOND_FIELD - 1'st or 2'd field in the sequence
- unsigned int display_flags; //1,2 or 1+2 fields for XvMCPutSurface,
+ unsigned int picture_structure; //top/bottom fields or frame!
+ unsigned int flags; //XVMC_SECOND_FIELD - 1st or 2nd field in the sequence
+ unsigned int display_flags; //1,2 or 1+2 fields for XvMCPutSurface
-//these are internal communication ones
- int state;//0-free, 1 Waiting to Display, 2 Waiting for prediction
- int start_mv_blocks_num;//offset in the array for the current slice, updated by vo
- int filled_mv_blocks_num;//processed mv block in this slice, changed by decoder
+ //these are for internal communication
+ int state; //0 - free, 1 - waiting to display, 2 - waiting for prediction
+ int start_mv_blocks_num; //offset in the array for the current slice, updated by vo
+ int filled_mv_blocks_num; //processed mv block in this slice, changed by decoder
- int next_free_data_block_num;//used in add_mv_block, pointer to next free block
-//extensions
- void * p_osd_target_surface_render;//pointer to the surface where subpicture is rendered
+ int next_free_data_block_num; //used in add_mv_block, pointer to next free block
+ //extensions
+ void * p_osd_target_surface_render; //pointer to the surface where subpicture is rendered
-} xvmc_render_state_t;
+};
#endif /* AVCODEC_XVMC_RENDER_H */
diff --git a/libavcodec/xvmcvideo.c b/libavcodec/xvmcvideo.c
index 93d93a6..9c4d8d8 100644
--- a/libavcodec/xvmcvideo.c
+++ b/libavcodec/xvmcvideo.c
@@ -39,8 +39,8 @@
//set s->block
void XVMC_init_block(MpegEncContext *s){
-xvmc_render_state_t * render;
- render = (xvmc_render_state_t*)s->current_picture.data[2];
+ struct xvmc_render_state * render;
+ render = (struct xvmc_render_state*)s->current_picture.data[2];
assert(render != NULL);
if( (render == NULL) || (render->magic != MP_XVMC_RENDER_MAGIC) ){
assert(0);
@@ -69,11 +69,11 @@ const int mb_block_count = 4+(1<<s->chroma_format);
//These functions should be called on every new field and/or frame.
//They should be safe if they are called a few times for the same field!
int XVMC_field_start(MpegEncContext*s, AVCodecContext *avctx){
-xvmc_render_state_t * render,* last, * next;
+ struct xvmc_render_state * render, * last, * next;
assert(avctx != NULL);
- render = (xvmc_render_state_t*)s->current_picture.data[2];
+ render = (struct xvmc_render_state*)s->current_picture.data[2];
assert(render != NULL);
if( (render == NULL) || (render->magic != MP_XVMC_RENDER_MAGIC) )
return -1;//make sure that this is render packet
@@ -91,7 +91,7 @@ xvmc_render_state_t * render,* last, * next;
case FF_I_TYPE:
return 0;// no prediction from other frames
case FF_B_TYPE:
- next = (xvmc_render_state_t*)s->next_picture.data[2];
+ next = (struct xvmc_render_state*)s->next_picture.data[2];
assert(next!=NULL);
assert(next->state & MP_XVMC_STATE_PREDICTION);
if(next == NULL) return -1;
@@ -99,7 +99,7 @@ xvmc_render_state_t * render,* last, * next;
render->p_future_surface = next->p_surface;
//no return here, going to set forward prediction
case FF_P_TYPE:
- last = (xvmc_render_state_t*)s->last_picture.data[2];
+ last = (struct xvmc_render_state*)s->last_picture.data[2];
if(last == NULL)// && !s->first_field)
last = render;//predict second field from the first
if(last->magic != MP_XVMC_RENDER_MAGIC) return -1;
@@ -112,8 +112,8 @@ return -1;
}
void XVMC_field_end(MpegEncContext *s){
-xvmc_render_state_t * render;
- render = (xvmc_render_state_t*)s->current_picture.data[2];
+ struct xvmc_render_state * render;
+ render = (struct xvmc_render_state*)s->current_picture.data[2];
assert(render != NULL);
if(render->filled_mv_blocks_num > 0){
@@ -124,7 +124,7 @@ xvmc_render_state_t * render;
void XVMC_decode_mb(MpegEncContext *s){
XvMCMacroBlock * mv_block;
-xvmc_render_state_t * render;
+struct xvmc_render_state * render;
int i,cbp,blocks_per_mb;
const int mb_xy = s->mb_y * s->mb_stride + s->mb_x;
@@ -152,7 +152,7 @@ const int mb_xy = s->mb_y * s->mb_stride + s->mb_x;
s->current_picture.qscale_table[mb_xy] = s->qscale;
//START OF XVMC specific code
- render = (xvmc_render_state_t*)s->current_picture.data[2];
+ render = (struct xvmc_render_state*)s->current_picture.data[2];
assert(render!=NULL);
assert(render->magic==MP_XVMC_RENDER_MAGIC);
assert(render->mv_blocks);
diff --git a/libavdevice/bktr.c b/libavdevice/bktr.c
index 7b37f11..ea9a6e2 100644
--- a/libavdevice/bktr.c
+++ b/libavdevice/bktr.c
@@ -24,6 +24,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#define _BSD_SOURCE 1
#include "libavformat/avformat.h"
#if defined (HAVE_DEV_BKTR_IOCTL_METEOR_H) && defined (HAVE_DEV_BKTR_IOCTL_BT848_H)
# include <dev/bktr/ioctl_meteor.h>
diff --git a/libavdevice/v4l.c b/libavdevice/v4l.c
index a469ae0..c2c67d0 100644
--- a/libavdevice/v4l.c
+++ b/libavdevice/v4l.c
@@ -21,6 +21,7 @@
#undef __STRICT_ANSI__ //workaround due to broken kernel headers
#include "config.h"
+#include "libavutil/rational.h"
#include "libavformat/avformat.h"
#include "libavcodec/dsputil.h"
#include <unistd.h>
@@ -37,13 +38,12 @@ typedef struct {
int fd;
int frame_format; /* see VIDEO_PALETTE_xxx */
int use_mmap;
- int width, height;
- int frame_rate;
- int frame_rate_base;
+ AVRational time_base;
int64_t time_frame;
int frame_size;
struct video_capability video_cap;
struct video_audio audio_saved;
+ struct video_window video_win;
uint8_t *video_buf;
struct video_mbuf gb_buffers;
struct video_mmap gb_buf;
@@ -70,9 +70,7 @@ static int grab_read_header(AVFormatContext *s1, AVFormatParameters *ap)
{
VideoData *s = s1->priv_data;
AVStream *st;
- int width, height;
- int video_fd, frame_size;
- int ret, frame_rate, frame_rate_base;
+ int video_fd;
int desired_palette, desired_depth;
struct video_tuner tuner;
struct video_audio audio;
@@ -80,44 +78,27 @@ static int grab_read_header(AVFormatContext *s1, AVFormatParameters *ap)
int j;
int vformat_num = FF_ARRAY_ELEMS(video_formats);
- if (ap->width <= 0 || ap->height <= 0) {
- av_log(s1, AV_LOG_ERROR, "Wrong size (%dx%d)\n", ap->width, ap->height);
- return -1;
- }
if (ap->time_base.den <= 0) {
av_log(s1, AV_LOG_ERROR, "Wrong time base (%d)\n", ap->time_base.den);
return -1;
}
+ s->time_base = ap->time_base;
- width = ap->width;
- height = ap->height;
- frame_rate = ap->time_base.den;
- frame_rate_base = ap->time_base.num;
-
- if((unsigned)width > 32767 || (unsigned)height > 32767) {
- av_log(s1, AV_LOG_ERROR, "Capture size is out of range: %dx%d\n",
- width, height);
-
- return -1;
- }
+ s->video_win.width = ap->width;
+ s->video_win.height = ap->height;
st = av_new_stream(s1, 0);
if (!st)
return AVERROR(ENOMEM);
av_set_pts_info(st, 64, 1, 1000000); /* 64 bits pts in us */
- s->width = width;
- s->height = height;
- s->frame_rate = frame_rate;
- s->frame_rate_base = frame_rate_base;
-
video_fd = open(s1->filename, O_RDWR);
if (video_fd < 0) {
av_log(s1, AV_LOG_ERROR, "%s: %s\n", s1->filename, strerror(errno));
goto fail;
}
- if (ioctl(video_fd,VIDIOCGCAP, &s->video_cap) < 0) {
+ if (ioctl(video_fd, VIDIOCGCAP, &s->video_cap) < 0) {
av_log(s1, AV_LOG_ERROR, "VIDIOCGCAP: %s\n", strerror(errno));
goto fail;
}
@@ -127,6 +108,17 @@ static int grab_read_header(AVFormatContext *s1, AVFormatParameters *ap)
goto fail;
}
+ /* no values set, autodetect them */
+ if (s->video_win.width <= 0 || s->video_win.height <= 0) {
+ if (ioctl(video_fd, VIDIOCGWIN, &s->video_win, sizeof(s->video_win)) < 0) {
+ av_log(s1, AV_LOG_ERROR, "VIDIOCGWIN: %s\n", strerror(errno));
+ goto fail;
+ }
+ }
+
+ if(avcodec_check_dimensions(s1, s->video_win.width, s->video_win.height) < 0)
+ return -1;
+
desired_palette = -1;
desired_depth = -1;
for (j = 0; j < vformat_num; j++) {
@@ -167,7 +159,7 @@ static int grab_read_header(AVFormatContext *s1, AVFormatParameters *ap)
/* try to choose a suitable video format */
pict.palette = desired_palette;
pict.depth= desired_depth;
- if (desired_palette == -1 || (ret = ioctl(video_fd, VIDIOCSPICT, &pict)) < 0) {
+ if (desired_palette == -1 || ioctl(video_fd, VIDIOCSPICT, &pict) < 0) {
for (j = 0; j < vformat_num; j++) {
pict.palette = video_formats[j].palette;
pict.depth = video_formats[j].depth;
@@ -178,53 +170,54 @@ static int grab_read_header(AVFormatContext *s1, AVFormatParameters *ap)
goto fail1;
}
- ret = ioctl(video_fd,VIDIOCGMBUF,&s->gb_buffers);
- if (ret < 0) {
+ if (ioctl(video_fd, VIDIOCGMBUF, &s->gb_buffers) < 0) {
/* try to use read based access */
- struct video_window win;
int val;
- win.x = 0;
- win.y = 0;
- win.width = width;
- win.height = height;
- win.chromakey = -1;
- win.flags = 0;
+ s->video_win.x = 0;
+ s->video_win.y = 0;
+ s->video_win.chromakey = -1;
+ s->video_win.flags = 0;
- ioctl(video_fd, VIDIOCSWIN, &win);
+ if (ioctl(video_fd, VIDIOCSWIN, s->video_win) < 0) {
+ av_log(s1, AV_LOG_ERROR, "VIDIOCSWIN: %s\n", strerror(errno));
+ goto fail;
+ }
s->frame_format = pict.palette;
val = 1;
- ioctl(video_fd, VIDIOCCAPTURE, &val);
+ if (ioctl(video_fd, VIDIOCCAPTURE, &val) < 0) {
+ av_log(s1, AV_LOG_ERROR, "VIDIOCCAPTURE: %s\n", strerror(errno));
+ goto fail;
+ }
- s->time_frame = av_gettime() * s->frame_rate / s->frame_rate_base;
+ s->time_frame = av_gettime() * s->time_base.den / s->time_base.num;
s->use_mmap = 0;
} else {
- s->video_buf = mmap(0,s->gb_buffers.size,PROT_READ|PROT_WRITE,MAP_SHARED,video_fd,0);
+ s->video_buf = mmap(0, s->gb_buffers.size, PROT_READ|PROT_WRITE, MAP_SHARED, video_fd, 0);
if ((unsigned char*)-1 == s->video_buf) {
- s->video_buf = mmap(0,s->gb_buffers.size,PROT_READ|PROT_WRITE,MAP_PRIVATE,video_fd,0);
+ s->video_buf = mmap(0, s->gb_buffers.size, PROT_READ|PROT_WRITE, MAP_PRIVATE, video_fd, 0);
if ((unsigned char*)-1 == s->video_buf) {
av_log(s1, AV_LOG_ERROR, "mmap: %s\n", strerror(errno));
goto fail;
}
}
s->gb_frame = 0;
- s->time_frame = av_gettime() * s->frame_rate / s->frame_rate_base;
+ s->time_frame = av_gettime() * s->time_base.den / s->time_base.num;
/* start to grab the first frame */
s->gb_buf.frame = s->gb_frame % s->gb_buffers.frames;
- s->gb_buf.height = height;
- s->gb_buf.width = width;
+ s->gb_buf.height = s->video_win.height;
+ s->gb_buf.width = s->video_win.width;
s->gb_buf.format = pict.palette;
- ret = ioctl(video_fd, VIDIOCMCAPTURE, &s->gb_buf);
- if (ret < 0) {
+ if (ioctl(video_fd, VIDIOCMCAPTURE, &s->gb_buf) < 0) {
if (errno != EAGAIN) {
fail1:
- av_log(s1, AV_LOG_ERROR, "Fatal: grab device does not support suitable format\n");
+ av_log(s1, AV_LOG_ERROR, "VIDIOCMCAPTURE: %s\n", strerror(errno));
} else {
- av_log(s1, AV_LOG_ERROR,"Fatal: grab device does not receive any video signal\n");
+ av_log(s1, AV_LOG_ERROR, "Fatal: grab device does not receive any video signal\n");
}
goto fail;
}
@@ -238,7 +231,7 @@ static int grab_read_header(AVFormatContext *s1, AVFormatParameters *ap)
for (j = 0; j < vformat_num; j++) {
if (s->frame_format == video_formats[j].palette) {
- frame_size = width * height * video_formats[j].depth / 8;
+ s->frame_size = s->video_win.width * s->video_win.height * video_formats[j].depth / 8;
st->codec->pix_fmt = video_formats[j].pix_fmt;
break;
}
@@ -248,15 +241,13 @@ static int grab_read_header(AVFormatContext *s1, AVFormatParameters *ap)
goto fail;
s->fd = video_fd;
- s->frame_size = frame_size;
st->codec->codec_type = CODEC_TYPE_VIDEO;
st->codec->codec_id = CODEC_ID_RAWVIDEO;
- st->codec->width = width;
- st->codec->height = height;
- st->codec->time_base.den = frame_rate;
- st->codec->time_base.num = frame_rate_base;
- st->codec->bit_rate = frame_size * 1/av_q2d(st->codec->time_base) * 8;
+ st->codec->width = s->video_win.width;
+ st->codec->height = s->video_win.height;
+ st->codec->time_base = s->time_base;
+ st->codec->bit_rate = s->frame_size * 1/av_q2d(st->codec->time_base) * 8;
return 0;
fail:
@@ -303,9 +294,9 @@ static int grab_read_packet(AVFormatContext *s1, AVPacket *pkt)
/* wait based on the frame rate */
for(;;) {
curtime = av_gettime();
- delay = s->time_frame * s->frame_rate_base / s->frame_rate - curtime;
+ delay = s->time_frame * s->time_base.num / s->time_base.den - curtime;
if (delay <= 0) {
- if (delay < INT64_C(-1000000) * s->frame_rate_base / s->frame_rate) {
+ if (delay < INT64_C(-1000000) * s->time_base.num / s->time_base.den) {
/* printf("grabbing is %d frames late (dropping)\n", (int) -(delay / 16666)); */
s->time_frame += INT64_C(1000000);
}
@@ -349,7 +340,7 @@ static int grab_read_close(AVFormatContext *s1)
AVInputFormat v4l_demuxer = {
"video4linux",
- NULL_IF_CONFIG_SMALL("video grab"),
+ NULL_IF_CONFIG_SMALL("Video4Linux device grab"),
sizeof(VideoData),
NULL,
grab_read_header,
diff --git a/libavdevice/v4l2.c b/libavdevice/v4l2.c
index dc0a22d..1efed83 100644
--- a/libavdevice/v4l2.c
+++ b/libavdevice/v4l2.c
@@ -57,8 +57,6 @@ struct video_data {
int frame_format; /* V4L2_PIX_FMT_* */
enum io_method io_method;
int width, height;
- int frame_rate;
- int frame_rate_base;
int frame_size;
int top_field_first;
@@ -509,28 +507,19 @@ static int v4l2_read_header(AVFormatContext *s1, AVFormatParameters *ap)
struct video_data *s = s1->priv_data;
AVStream *st;
int width, height;
- int res, frame_rate, frame_rate_base;
+ int res;
uint32_t desired_format, capabilities;
if (ap->width <= 0 || ap->height <= 0) {
av_log(s1, AV_LOG_ERROR, "Wrong size (%dx%d)\n", ap->width, ap->height);
return -1;
}
- if (ap->time_base.den <= 0) {
- av_log(s1, AV_LOG_ERROR, "Wrong time base (%d)\n", ap->time_base.den);
- return -1;
- }
width = ap->width;
height = ap->height;
- frame_rate = ap->time_base.den;
- frame_rate_base = ap->time_base.num;
-
- if((unsigned)width > 32767 || (unsigned)height > 32767) {
- av_log(s1, AV_LOG_ERROR, "Wrong size (%dx%d)\n", width, height);
+ if(avcodec_check_dimensions(s1, ap->width, ap->height) < 0)
return -1;
- }
st = av_new_stream(s1, 0);
if (!st) {
@@ -540,8 +529,6 @@ static int v4l2_read_header(AVFormatContext *s1, AVFormatParameters *ap)
s->width = width;
s->height = height;
- s->frame_rate = frame_rate;
- s->frame_rate_base = frame_rate_base;
capabilities = 0;
s->fd = device_open(s1, &capabilities);
@@ -602,8 +589,8 @@ static int v4l2_read_header(AVFormatContext *s1, AVFormatParameters *ap)
st->codec->codec_id = CODEC_ID_RAWVIDEO;
st->codec->width = width;
st->codec->height = height;
- st->codec->time_base.den = frame_rate;
- st->codec->time_base.num = frame_rate_base;
+ st->codec->time_base.den = ap->time_base.den;
+ st->codec->time_base.num = ap->time_base.num;
st->codec->bit_rate = s->frame_size * 1/av_q2d(st->codec->time_base) * 8;
return 0;
@@ -651,7 +638,7 @@ static int v4l2_read_close(AVFormatContext *s1)
AVInputFormat v4l2_demuxer = {
"video4linux2",
- NULL_IF_CONFIG_SMALL("video grab"),
+ NULL_IF_CONFIG_SMALL("Video4Linux2 device grab"),
sizeof(struct video_data),
NULL,
v4l2_read_header,
diff --git a/libavdevice/vfwcap.c b/libavdevice/vfwcap.c
index 078c186..83e499c 100644
--- a/libavdevice/vfwcap.c
+++ b/libavdevice/vfwcap.c
@@ -27,18 +27,6 @@
/* Defines for VFW missing from MinGW.
* Remove this when MinGW incorporates them. */
-#define WM_CAP_START (0x0400)
-#define WM_CAP_SET_CALLBACK_VIDEOSTREAM (WM_CAP_START + 6)
-#define WM_CAP_DRIVER_CONNECT (WM_CAP_START + 10)
-#define WM_CAP_DRIVER_DISCONNECT (WM_CAP_START + 11)
-#define WM_CAP_GET_VIDEOFORMAT (WM_CAP_START + 44)
-#define WM_CAP_SET_VIDEOFORMAT (WM_CAP_START + 45)
-#define WM_CAP_SET_PREVIEW (WM_CAP_START + 50)
-#define WM_CAP_SET_OVERLAY (WM_CAP_START + 51)
-#define WM_CAP_SEQUENCE_NOFILE (WM_CAP_START + 63)
-#define WM_CAP_SET_SEQUENCE_SETUP (WM_CAP_START + 64)
-#define WM_CAP_GET_SEQUENCE_SETUP (WM_CAP_START + 65)
-
#define HWND_MESSAGE ((HWND)-3)
#define BI_RGB 0
diff --git a/libavdevice/x11grab.c b/libavdevice/x11grab.c
index 4e1a499..8f37f28 100644
--- a/libavdevice/x11grab.c
+++ b/libavdevice/x11grab.c
@@ -55,7 +55,7 @@
/**
* X11 Device Demuxer context
*/
-typedef struct x11_grab_s
+struct x11_grab
{
int frame_size; /**< Size in bytes of a grabbed frame */
AVRational time_base; /**< Time base */
@@ -71,7 +71,7 @@ typedef struct x11_grab_s
int use_shm; /**< !0 when using XShm extension */
XShmSegmentInfo shminfo; /**< When using XShm, keeps track of XShm infos */
int mouse_warning_shown;
-} x11_grab_t;
+};
/**
* Initializes the x11 grab device demuxer (public device demuxer API).
@@ -87,7 +87,7 @@ typedef struct x11_grab_s
static int
x11grab_read_header(AVFormatContext *s1, AVFormatParameters *ap)
{
- x11_grab_t *x11grab = s1->priv_data;
+ struct x11_grab *x11grab = s1->priv_data;
Display *dpy;
AVStream *st = NULL;
int input_pixfmt;
@@ -259,7 +259,7 @@ get_pointer_coordinates(int *x, int *y, Display *dpy, AVFormatContext *s1)
if (XQueryPointer(dpy, mrootwindow, &mrootwindow, &childwindow,
x, y, &dummy, &dummy, (unsigned int*)&dummy)) {
} else {
- x11_grab_t *s = s1->priv_data;
+ struct x11_grab *s = s1->priv_data;
if (!s->mouse_warning_shown) {
av_log(s1, AV_LOG_INFO, "couldn't find mouse pointer\n");
s->mouse_warning_shown = 1;
@@ -306,7 +306,7 @@ apply_masks(uint8_t *dst, int and, int or, int bits_per_pixel)
* @param y Mouse pointer coordinate
*/
static void
-paint_mouse_pointer(XImage *image, x11_grab_t *s, int x, int y)
+paint_mouse_pointer(XImage *image, struct x11_grab *s, int x, int y)
{
/* 16x20x1bpp bitmap for the black channel of the mouse pointer */
static const uint16_t const mousePointerBlack[] =
@@ -431,7 +431,7 @@ xget_zpixmap(Display *dpy, Drawable d, XImage *image, int x, int y)
static int
x11grab_read_packet(AVFormatContext *s1, AVPacket *pkt)
{
- x11_grab_t *s = s1->priv_data;
+ struct x11_grab *s = s1->priv_data;
Display *dpy = s->dpy;
XImage *image = s->image;
int x_off = s->x_off;
@@ -495,7 +495,7 @@ x11grab_read_packet(AVFormatContext *s1, AVPacket *pkt)
static int
x11grab_read_close(AVFormatContext *s1)
{
- x11_grab_t *x11grab = s1->priv_data;
+ struct x11_grab *x11grab = s1->priv_data;
/* Detach cleanly from shared mem */
if (x11grab->use_shm) {
@@ -520,7 +520,7 @@ AVInputFormat x11_grab_device_demuxer =
{
"x11grab",
NULL_IF_CONFIG_SMALL("X11grab"),
- sizeof(x11_grab_t),
+ sizeof(struct x11_grab),
NULL,
x11grab_read_header,
x11grab_read_packet,
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 1cd4992..9da8f91 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -5,6 +5,8 @@ FFLIBS = avcodec avutil
FFLIBS-$(CONFIG_SWSCALE) += swscale
FFLIBS-$(CONFIG_AVFILTER_LAVF) += avformat
+HEADERS = avfilter.h
+
OBJS = allfilters.o \
avfilter.o \
defaults.o \
@@ -12,6 +14,4 @@ OBJS = allfilters.o \
#OBJS-$(CONFIG_XXX_FILTER) += vf_xxx.o
-HEADERS = avfilter.h
-
include $(SUBDIR)../subdir.mak
diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c
index d6f07a9..218a83b 100644
--- a/libavfilter/avfilter.c
+++ b/libavfilter/avfilter.c
@@ -90,7 +90,7 @@ int avfilter_link(AVFilterContext *src, unsigned srcpad,
link->dst = dst;
link->srcpad = srcpad;
link->dstpad = dstpad;
- link->format = -1;
+ link->format = PIX_FMT_NONE;
return 0;
}
diff --git a/libavfilter/avfilter.h b/libavfilter/avfilter.h
index ff08f71..75d8c76 100644
--- a/libavfilter/avfilter.h
+++ b/libavfilter/avfilter.h
@@ -257,7 +257,7 @@ struct AVFilterPad
enum CodecType type;
/**
- * Minimum required permissions on incoming buffers. Any buffers with
+ * Minimum required permissions on incoming buffers. Any buffer with
* insufficient permissions will be automatically copied by the filter
* system to a new buffer which provides the needed access permissions.
*
@@ -267,9 +267,10 @@ struct AVFilterPad
/**
* Permissions which are not accepted on incoming buffers. Any buffer
- * which has any of these permissions set be automatically copied by the
- * filter system to a new buffer which does not have those permissions.
- * This can be used to easily disallow buffers with AV_PERM_REUSE.
+ * which has any of these permissions set will be automatically copied
+ * by the filter system to a new buffer which does not have those
+ * permissions. This can be used to easily disallow buffers with
+ * AV_PERM_REUSE.
*
* Input pads only.
*/
@@ -509,7 +510,8 @@ int avfilter_request_frame(AVFilterLink *link);
/**
* Poll a frame from the filter chain.
* @param link the input link
- * @return the number of imediately available frames
+ * @return the number of immediately available frames, a negative
+ * number in case of error
*/
int avfilter_poll_frame(AVFilterLink *link);
diff --git a/libavfilter/avfiltergraph.h b/libavfilter/avfiltergraph.h
index 0558de3..dee8a90 100644
--- a/libavfilter/avfiltergraph.h
+++ b/libavfilter/avfiltergraph.h
@@ -30,7 +30,10 @@ typedef struct AVFilterGraph {
} AVFilterGraph;
/**
- * Get a pointer to a graph by instance name
+ * Get from \p graph a filter instance with name \p name.
+ *
+ * @return the pointer to the found filter instance or NULL if it
+ * cannot be found.
*/
AVFilterContext *avfilter_graph_get_filter(AVFilterGraph *graph, char *name);
diff --git a/libavfilter/graphparser.c b/libavfilter/graphparser.c
index 25c5c4d..e7019ea 100644
--- a/libavfilter/graphparser.c
+++ b/libavfilter/graphparser.c
@@ -119,43 +119,43 @@ static char *parse_link_name(const char **buf, AVClass *log_ctx)
}
static AVFilterContext *create_filter(AVFilterGraph *ctx, int index,
- const char *name, const char *args,
+ const char *filt_name, const char *args,
AVClass *log_ctx)
{
- AVFilterContext *filt;
+ AVFilterContext *filt_ctx;
- AVFilter *filterdef;
+ AVFilter *filt;
char inst_name[30];
snprintf(inst_name, sizeof(inst_name), "Parsed filter %d", index);
- filterdef = avfilter_get_by_name(name);
+ filt = avfilter_get_by_name(filt_name);
- if(!filterdef) {
+ if(!filt) {
av_log(log_ctx, AV_LOG_ERROR,
- "no such filter: '%s'\n", name);
+ "no such filter: '%s'\n", filt_name);
return NULL;
}
- filt = avfilter_open(filterdef, inst_name);
- if(!filt) {
+ filt_ctx = avfilter_open(filt, inst_name);
+ if(!filt_ctx) {
av_log(log_ctx, AV_LOG_ERROR,
- "error creating filter '%s'\n", name);
+ "error creating filter '%s'\n", filt_name);
return NULL;
}
- if(avfilter_graph_add_filter(ctx, filt) < 0) {
- avfilter_destroy(filt);
+ if(avfilter_graph_add_filter(ctx, filt_ctx) < 0) {
+ avfilter_destroy(filt_ctx);
return NULL;
}
- if(avfilter_init_filter(filt, args, NULL)) {
+ if(avfilter_init_filter(filt_ctx, args, NULL)) {
av_log(log_ctx, AV_LOG_ERROR,
- "error initializing filter '%s' with args '%s'\n", name, args);
+ "error initializing filter '%s' with args '%s'\n", filt_name, args);
return NULL;
}
- return filt;
+ return filt_ctx;
}
/**
@@ -211,13 +211,13 @@ static void insert_inout(AVFilterInOut **inouts, AVFilterInOut *element)
}
static int link_filter_inouts(AVFilterContext *filter,
- AVFilterInOut **currInputs,
- AVFilterInOut **openInputs, AVClass *log_ctx)
+ AVFilterInOut **curr_inputs,
+ AVFilterInOut **open_inputs, AVClass *log_ctx)
{
int pad = filter->input_count;
while(pad--) {
- AVFilterInOut *p = *currInputs;
+ AVFilterInOut *p = *curr_inputs;
if(!p) {
av_log(log_ctx, AV_LOG_ERROR,
"Not enough inputs specified for the \"%s\" filter.\n",
@@ -225,7 +225,7 @@ static int link_filter_inouts(AVFilterContext *filter,
return -1;
}
- *currInputs = (*currInputs)->next;
+ *curr_inputs = (*curr_inputs)->next;
if(p->filter) {
if(link_filter(p->filter, p->pad_idx, filter, pad, log_ctx))
@@ -235,11 +235,11 @@ static int link_filter_inouts(AVFilterContext *filter,
} else {
p->filter = filter;
p->pad_idx = pad;
- insert_inout(openInputs, p);
+ insert_inout(open_inputs, p);
}
}
- if(*currInputs) {
+ if(*curr_inputs) {
av_log(log_ctx, AV_LOG_ERROR,
"Too many inputs specified for the \"%s\" filter.\n",
filter->filter->name);
@@ -251,14 +251,14 @@ static int link_filter_inouts(AVFilterContext *filter,
AVFilterInOut *currlinkn = av_mallocz(sizeof(AVFilterInOut));
currlinkn->filter = filter;
currlinkn->pad_idx = pad;
- insert_inout(currInputs, currlinkn);
+ insert_inout(curr_inputs, currlinkn);
}
return 0;
}
-static int parse_inputs(const char **buf, AVFilterInOut **currInputs,
- AVFilterInOut **openOutputs, AVClass *log_ctx)
+static int parse_inputs(const char **buf, AVFilterInOut **curr_inputs,
+ AVFilterInOut **open_outputs, AVClass *log_ctx)
{
int pad = 0;
@@ -269,8 +269,8 @@ static int parse_inputs(const char **buf, AVFilterInOut **currInputs,
if(!name)
return -1;
- /* First check if the label is not in the openOutputs list */
- match = extract_inout(name, openOutputs);
+ /* First check if the label is not in the open_outputs list */
+ match = extract_inout(name, open_outputs);
if(match) {
av_free(name);
@@ -281,7 +281,7 @@ static int parse_inputs(const char **buf, AVFilterInOut **currInputs,
match->pad_idx = pad;
}
- insert_inout(currInputs, match);
+ insert_inout(curr_inputs, match);
*buf += consume_whitespace(*buf);
pad++;
@@ -290,9 +290,9 @@ static int parse_inputs(const char **buf, AVFilterInOut **currInputs,
return pad;
}
-static int parse_outputs(const char **buf, AVFilterInOut **currInputs,
- AVFilterInOut **openInputs,
- AVFilterInOut **openOutputs, AVClass *log_ctx)
+static int parse_outputs(const char **buf, AVFilterInOut **curr_inputs,
+ AVFilterInOut **open_inputs,
+ AVFilterInOut **open_outputs, AVClass *log_ctx)
{
int pad = 0;
@@ -300,14 +300,14 @@ static int parse_outputs(const char **buf, AVFilterInOut **currInputs,
char *name = parse_link_name(buf, log_ctx);
AVFilterInOut *match;
- AVFilterInOut *input = *currInputs;
- *currInputs = (*currInputs)->next;
+ AVFilterInOut *input = *curr_inputs;
+ *curr_inputs = (*curr_inputs)->next;
if(!name)
return -1;
- /* First check if the label is not in the openInputs list */
- match = extract_inout(name, openInputs);
+ /* First check if the label is not in the open_inputs list */
+ match = extract_inout(name, open_inputs);
if(match) {
if(link_filter(input->filter, input->pad_idx,
@@ -318,9 +318,9 @@ static int parse_outputs(const char **buf, AVFilterInOut **currInputs,
av_free(match);
av_free(input);
} else {
- /* Not in the list, so add the first input as a openOutput */
+ /* Not in the list, so add the first input as a open_output */
input->name = name;
- insert_inout(openOutputs, input);
+ insert_inout(open_outputs, input);
}
*buf += consume_whitespace(*buf);
pad++;
@@ -330,19 +330,19 @@ static int parse_outputs(const char **buf, AVFilterInOut **currInputs,
}
int avfilter_parse_graph(AVFilterGraph *graph, const char *filters,
- AVFilterInOut *openInputs,
- AVFilterInOut *openOutputs, AVClass *log_ctx)
+ AVFilterInOut *open_inputs,
+ AVFilterInOut *open_outputs, AVClass *log_ctx)
{
int index = 0;
char chr = 0;
- AVFilterInOut *currInputs = NULL;
+ AVFilterInOut *curr_inputs = NULL;
do {
AVFilterContext *filter;
filters += consume_whitespace(filters);
- if(parse_inputs(&filters, &currInputs, &openOutputs, log_ctx) < 0)
+ if(parse_inputs(&filters, &curr_inputs, &open_outputs, log_ctx) < 0)
goto fail;
filter = parse_filter(&filters, graph, index, log_ctx);
@@ -350,24 +350,24 @@ int avfilter_parse_graph(AVFilterGraph *graph, const char *filters,
if(!filter)
goto fail;
- if(filter->input_count == 1 && !currInputs && !index) {
- /* First input can be ommitted if it is "[in]" */
+ if(filter->input_count == 1 && !curr_inputs && !index) {
+ /* First input can be omitted if it is "[in]" */
const char *tmp = "[in]";
- if(parse_inputs(&tmp, &currInputs, &openOutputs, log_ctx) < 0)
+ if(parse_inputs(&tmp, &curr_inputs, &open_outputs, log_ctx) < 0)
goto fail;
}
- if(link_filter_inouts(filter, &currInputs, &openInputs, log_ctx) < 0)
+ if(link_filter_inouts(filter, &curr_inputs, &open_inputs, log_ctx) < 0)
goto fail;
- if(parse_outputs(&filters, &currInputs, &openInputs, &openOutputs,
+ if(parse_outputs(&filters, &curr_inputs, &open_inputs, &open_outputs,
log_ctx) < 0)
goto fail;
filters += consume_whitespace(filters);
chr = *filters++;
- if(chr == ';' && currInputs) {
+ if(chr == ';' && curr_inputs) {
av_log(log_ctx, AV_LOG_ERROR,
"Could not find a output to link when parsing \"%s\"\n",
filters - 1);
@@ -376,11 +376,11 @@ int avfilter_parse_graph(AVFilterGraph *graph, const char *filters,
index++;
} while(chr == ',' || chr == ';');
- if(openInputs && !strcmp(openInputs->name, "out") && currInputs) {
- /* Last output can be ommitted if it is "[out]" */
+ if(open_inputs && !strcmp(open_inputs->name, "out") && curr_inputs) {
+ /* Last output can be omitted if it is "[out]" */
const char *tmp = "[out]";
- if(parse_outputs(&tmp, &currInputs, &openInputs,
- &openOutputs, log_ctx) < 0)
+ if(parse_outputs(&tmp, &curr_inputs, &open_inputs,
+ &open_outputs, log_ctx) < 0)
goto fail;
}
@@ -388,8 +388,8 @@ int avfilter_parse_graph(AVFilterGraph *graph, const char *filters,
fail:
avfilter_destroy_graph(graph);
- free_inout(openInputs);
- free_inout(openOutputs);
- free_inout(currInputs);
+ free_inout(open_inputs);
+ free_inout(open_outputs);
+ free_inout(curr_inputs);
return -1;
}
diff --git a/libavfilter/graphparser.h b/libavfilter/graphparser.h
index eddc6b0..725b728 100644
--- a/libavfilter/graphparser.h
+++ b/libavfilter/graphparser.h
@@ -38,10 +38,11 @@ typedef struct AVFilterInOut {
/**
* Add to a graph a graph described by a string.
+ *
* @param graph the filter graph where to link the parsed graph context
* @param filters string to be parsed
- * @param inouts linked list to the inputs and outputs of the graph
- * @param outpad pad index of the output
+ * @param inputs linked list to the inputs of the graph
+ * @param outputs linked list to the outputs of the graph
* @return zero on success, -1 on error
*/
int avfilter_parse_graph(AVFilterGraph *graph, const char *filters,
diff --git a/libavformat/Makefile b/libavformat/Makefile
index de5cae2..10a461c 100644
--- a/libavformat/Makefile
+++ b/libavformat/Makefile
@@ -3,10 +3,10 @@ include $(SUBDIR)../config.mak
NAME = avformat
FFLIBS = avcodec avutil
-OBJS = allformats.o cutils.o os_support.o sdp.o utils.o
-
HEADERS = avformat.h avio.h rtsp.h rtspcodes.h
+OBJS = allformats.o cutils.o metadata.o metadata_compat.o os_support.o sdp.o utils.o
+
# muxers/demuxers
OBJS-$(CONFIG_AAC_DEMUXER) += raw.o
OBJS-$(CONFIG_AC3_DEMUXER) += raw.o
@@ -81,9 +81,9 @@ OBJS-$(CONFIG_IPOD_MUXER) += movenc.o riff.o isom.o avc.o
OBJS-$(CONFIG_LMLM4_DEMUXER) += lmlm4.o
OBJS-$(CONFIG_M4V_DEMUXER) += raw.o
OBJS-$(CONFIG_M4V_MUXER) += raw.o
-OBJS-$(CONFIG_MATROSKA_AUDIO_MUXER) += matroskaenc.o matroska.o riff.o avc.o
-OBJS-$(CONFIG_MATROSKA_DEMUXER) += matroskadec.o matroska.o riff.o
-OBJS-$(CONFIG_MATROSKA_MUXER) += matroskaenc.o matroska.o riff.o avc.o
+OBJS-$(CONFIG_MATROSKA_AUDIO_MUXER) += matroskaenc.o matroska.o riff.o isom.o avc.o
+OBJS-$(CONFIG_MATROSKA_DEMUXER) += matroskadec.o matroska.o riff.o isom.o
+OBJS-$(CONFIG_MATROSKA_MUXER) += matroskaenc.o matroska.o riff.o isom.o avc.o
OBJS-$(CONFIG_MJPEG_DEMUXER) += raw.o
OBJS-$(CONFIG_MJPEG_MUXER) += raw.o
OBJS-$(CONFIG_MLP_DEMUXER) += raw.o
@@ -158,9 +158,9 @@ OBJS-$(CONFIG_ROQ_DEMUXER) += idroq.o
OBJS-$(CONFIG_ROQ_MUXER) += raw.o
OBJS-$(CONFIG_RPL_DEMUXER) += rpl.o
OBJS-$(CONFIG_RTP_MUXER) += rtp.o \
- rtpenc.o \
- rtp_mpv.o \
rtp_aac.o \
+ rtp_mpv.o \
+ rtpenc.o \
rtpenc_h264.o \
avc.o
OBJS-$(CONFIG_RTSP_DEMUXER) += rdt.o rtsp.o
diff --git a/libavformat/allformats.c b/libavformat/allformats.c
index 8c0cd5d..43243f8 100644
--- a/libavformat/allformats.c
+++ b/libavformat/allformats.c
@@ -23,15 +23,18 @@
#include "rdt.h"
#define REGISTER_MUXER(X,x) { \
- extern AVOutputFormat x##_muxer; \
- if(ENABLE_##X##_MUXER) av_register_output_format(&x##_muxer); }
+ extern AVOutputFormat x##_muxer; \
+ if(ENABLE_##X##_MUXER) av_register_output_format(&x##_muxer); }
+
#define REGISTER_DEMUXER(X,x) { \
- extern AVInputFormat x##_demuxer; \
- if(ENABLE_##X##_DEMUXER) av_register_input_format(&x##_demuxer); }
+ extern AVInputFormat x##_demuxer; \
+ if(ENABLE_##X##_DEMUXER) av_register_input_format(&x##_demuxer); }
+
#define REGISTER_MUXDEMUX(X,x) REGISTER_MUXER(X,x); REGISTER_DEMUXER(X,x)
+
#define REGISTER_PROTOCOL(X,x) { \
- extern URLProtocol x##_protocol; \
- if(ENABLE_##X##_PROTOCOL) register_protocol(&x##_protocol); }
+ extern URLProtocol x##_protocol; \
+ if(ENABLE_##X##_PROTOCOL) register_protocol(&x##_protocol); }
/* If you do not call this function, then you can select exactly which
formats you want to support */
@@ -47,7 +50,6 @@ void av_register_all(void)
return;
initialized = 1;
- avcodec_init();
avcodec_register_all();
/* (de)muxers */
diff --git a/libavformat/asf.c b/libavformat/asf.c
index 0faa46f..33cde4c 100644
--- a/libavformat/asf.c
+++ b/libavformat/asf.c
@@ -26,7 +26,7 @@
#include "asf.h"
#include "asfcrypt.h"
-extern void ff_mms_set_stream_selection(URLContext *h, AVFormatContext *format);
+void ff_mms_set_stream_selection(URLContext *h, AVFormatContext *format);
#undef NDEBUG
#include <assert.h>
@@ -555,19 +555,14 @@ static int asf_read_header(AVFormatContext *s, AVFormatParameters *ap)
default: var = defval; break; \
}
-/**
- *
- * @return <0 in case of an error
- */
-static int asf_get_packet(AVFormatContext *s)
+int ff_asf_get_packet(AVFormatContext *s, ByteIOContext *pb)
{
ASFContext *asf = s->priv_data;
- ByteIOContext *pb = s->pb;
uint32_t packet_length, padsize;
int rsize = 8;
int c, d, e, off;
- off= (url_ftell(s->pb) - s->data_offset) % asf->packet_size + 3;
+ off= (url_ftell(pb) - s->data_offset) % asf->packet_size + 3;
c=d=e=-1;
while(off-- > 0){
@@ -634,9 +629,8 @@ static int asf_get_packet(AVFormatContext *s)
*
* @return <0 if error
*/
-static int asf_read_frame_header(AVFormatContext *s){
+static int asf_read_frame_header(AVFormatContext *s, ByteIOContext *pb){
ASFContext *asf = s->priv_data;
- ByteIOContext *pb = s->pb;
int rsize = 1;
int num = get_byte(pb);
int64_t ts0, ts1;
@@ -705,12 +699,10 @@ static int asf_read_frame_header(AVFormatContext *s){
return 0;
}
-static int asf_read_packet(AVFormatContext *s, AVPacket *pkt)
+int ff_asf_parse_packet(AVFormatContext *s, ByteIOContext *pb, AVPacket *pkt)
{
ASFContext *asf = s->priv_data;
ASFStream *asf_st = 0;
- ByteIOContext *pb = s->pb;
- //static int pc = 0;
for (;;) {
if(url_feof(pb))
return AVERROR(EIO);
@@ -723,19 +715,14 @@ static int asf_read_packet(AVFormatContext *s, AVPacket *pkt)
/* fail safe */
url_fskip(pb, ret);
- asf->packet_pos= url_ftell(s->pb);
+ asf->packet_pos= url_ftell(pb);
if (asf->data_object_size != (uint64_t)-1 &&
(asf->packet_pos - asf->data_object_offset >= asf->data_object_size))
return AVERROR(EIO); /* Do not exceed the size of the data object */
- ret = asf_get_packet(s);
- //printf("READ ASF PACKET %d r:%d c:%d\n", ret, asf->packet_size_left, pc++);
- if (ret < 0)
- assert(asf->packet_size_left < FRAME_HEADER_SIZE || asf->packet_segments < 1);
- asf->packet_time_start = 0;
- continue;
+ return 1;
}
if (asf->packet_time_start == 0) {
- if(asf_read_frame_header(s) < 0){
+ if(asf_read_frame_header(s, pb) < 0){
asf->packet_segments= 0;
continue;
}
@@ -879,6 +866,24 @@ static int asf_read_packet(AVFormatContext *s, AVPacket *pkt)
return 0;
}
+static int asf_read_packet(AVFormatContext *s, AVPacket *pkt)
+{
+ ASFContext *asf = s->priv_data;
+
+ for (;;) {
+ int ret;
+
+ /* parse cached packets, if any */
+ if ((ret = ff_asf_parse_packet(s, s->pb, pkt)) <= 0)
+ return ret;
+ if ((ret = ff_asf_get_packet(s, s->pb)) < 0)
+ assert(asf->packet_size_left < FRAME_HEADER_SIZE || asf->packet_segments < 1);
+ asf->packet_time_start = 0;
+ }
+
+ return 0;
+}
+
// Added to support seeking after packets have been read
// If information is not reset, read_packet fails due to
// leftover information from previous reads
diff --git a/libavformat/asf.h b/libavformat/asf.h
index 4ccae10..afb015e 100644
--- a/libavformat/asf.h
+++ b/libavformat/asf.h
@@ -286,4 +286,25 @@ static const GUID my_guid = {
#define ASF_PL_FLAG_KEY_FRAME 0x80 //1000 0000
+extern AVInputFormat asf_demuxer;
+
+/**
+ * Load a single ASF packet into the demuxer.
+ * @param s demux context
+ * @param pb context to read data from
+ * @returns 0 on success, <0 on error
+ */
+int ff_asf_get_packet(AVFormatContext *s, ByteIOContext *pb);
+
+/**
+ * Parse data from individual ASF packets (which were previously loaded
+ * with asf_get_packet()).
+ * @param s demux context
+ * @param pb context to read data from
+ * @param pkt pointer to store packet data into
+ * @returns 0 if data was stored in pkt, <0 on error or 1 if more ASF
+ * packets need to be loaded (through asf_get_packet())
+ */
+int ff_asf_parse_packet(AVFormatContext *s, ByteIOContext *pb, AVPacket *pkt);
+
#endif /* AVFORMAT_ASF_H */
diff --git a/libavformat/avformat.h b/libavformat/avformat.h
index acdcec4..e8b7100 100644
--- a/libavformat/avformat.h
+++ b/libavformat/avformat.h
@@ -46,6 +46,59 @@ unsigned avformat_version(void);
#include "avio.h"
+
+/*
+ * Public Metadata API.
+ * The metadata API allows libavformat to export metadata tags to a client
+ * application using a sequence of key/value pairs.
+ * Important concepts to keep in mind:
+ * 1. Keys are unique; there can never be 2 tags with the same key. This is
+ * also meant semantically, i.e., a demuxer should not knowingly produce
+ * several keys that are literally different but semantically identical.
+ * E.g., key=Author5, key=Author6. In this example, all authors must be
+ * placed in the same tag.
+ * 2. Metadata is flat, not hierarchical; there are no subtags. If you
+ * want to store, e.g., the email address of the child of producer Alice
+ * and actor Bob, that could have key=alice_and_bobs_childs_email_address.
+ * 3. A tag whose value is localized for a particular language is appended
+ * with a dash character ('-') and the ISO 639 3-letter language code.
+ * For example: Author-ger=Michael, Author-eng=Mike
+ * The original/default language is in the unqualified "Author" tag.
+ * A demuxer should set a default if it sets any translated tag.
+ */
+
+#define AV_METADATA_IGNORE_CASE 1
+#define AV_METADATA_IGNORE_SUFFIX 2
+
+typedef struct {
+ char *key;
+ char *value;
+}AVMetadataTag;
+
+typedef struct AVMetadata AVMetadata;
+
+/**
+ * gets a metadata element with matching key.
+ * @param prev set to the previous matching element to find the next.
+ * @param flags allows case as well as suffix insensitive comparissions.
+ * @return found tag or NULL, changing key or value leads to undefined behavior.
+ */
+AVMetadataTag *
+av_metadata_get(AVMetadata *m, const char *key, const AVMetadataTag *prev, int flags);
+
+/**
+ * sets the given tag in m, overwriting an existing tag.
+ * @param tag tag to add to m, key and value will be av_strduped.
+ * @return >= 0 if success otherwise error code that is <0.
+ */
+int av_metadata_set(AVMetadata **m, AVMetadataTag tag);
+
+/**
+ * Free all the memory allocated for an AVMetadata struct.
+ */
+void av_metadata_free(AVMetadata **m);
+
+
/* packet functions */
typedef struct AVPacket {
@@ -160,7 +213,7 @@ static inline void av_free_packet(AVPacket *pkt)
*/
typedef struct AVFrac {
int64_t val, num, den;
-} AVFrac attribute_deprecated;
+} AVFrac;
/*************************************************/
/* input/output formats */
@@ -432,6 +485,8 @@ typedef struct AVStream {
* - decoding: Set by libavformat.
*/
AVRational sample_aspect_ratio;
+
+ AVMetadata *metadata;
} AVStream;
#define AV_PROGRAM_RUNNING 1
@@ -450,6 +505,7 @@ typedef struct AVProgram {
enum AVDiscard discard; ///< selects which program to discard and which to feed to the caller
unsigned int *stream_index;
unsigned int nb_stream_indexes;
+ AVMetadata *metadata;
} AVProgram;
#define AVFMTCTX_NOHEADER 0x0001 /**< signal that no header is present
@@ -460,6 +516,7 @@ typedef struct AVChapter {
AVRational time_base; ///< time base in which the start/end timestamps are specified
int64_t start, end; ///< chapter start/end time in time_base units
char *title; ///< chapter title
+ AVMetadata *metadata;
} AVChapter;
#define MAX_STREAMS 20
@@ -608,6 +665,8 @@ typedef struct AVFormatContext {
struct AVPacketList *raw_packet_buffer_end;
struct AVPacketList *packet_buffer_end;
+
+ AVMetadata *metadata;
} AVFormatContext;
typedef struct AVPacketList {
@@ -1018,6 +1077,7 @@ void dump_format(AVFormatContext *ic,
const char *url,
int is_output);
+#if LIBAVFORMAT_VERSION_MAJOR < 53
/**
* Parses width and height out of string str.
* @deprecated Use av_parse_video_frame_size instead.
@@ -1031,6 +1091,7 @@ attribute_deprecated int parse_image_size(int *width_ptr, int *height_ptr,
*/
attribute_deprecated int parse_frame_rate(int *frame_rate, int *frame_rate_base,
const char *arg);
+#endif
/**
* Parses \p datestr and returns a corresponding number of microseconds.
diff --git a/libavformat/avidec.c b/libavformat/avidec.c
index eaccd41..14f0ebd 100644
--- a/libavformat/avidec.c
+++ b/libavformat/avidec.c
@@ -163,7 +163,7 @@ static int read_braindead_odml_indx(AVFormatContext *s, int frame_num){
#endif
if(last_pos == pos || pos == base - 8)
avi->non_interleaved= 1;
- else
+ if(last_pos != pos)
av_add_index_entry(st, pos, ast->cum_len / FFMAX(1, ast->sample_size), len, 0, key ? AVINDEX_KEYFRAME : 0);
if(ast->sample_size)
@@ -216,13 +216,17 @@ static void clean_index(AVFormatContext *s){
}
}
-static int avi_read_tag(ByteIOContext *pb, char *buf, int maxlen, unsigned int size)
+static int avi_read_tag(AVFormatContext *s, const char *key, unsigned int size)
{
+ ByteIOContext *pb = s->pb;
+ uint8_t value[1024];
+
int64_t i = url_ftell(pb);
size += (size & 1);
- get_strz(pb, buf, maxlen);
+ get_strz(pb, value, sizeof(value));
url_fseek(pb, i+size, SEEK_SET);
- return 0;
+
+ return av_metadata_set(&s->metadata, (const AVMetadataTag){key, value});
}
static int avi_read_header(AVFormatContext *s, AVFormatParameters *ap)
@@ -235,7 +239,6 @@ static int avi_read_header(AVFormatContext *s, AVFormatParameters *ap)
int i;
AVStream *st;
AVIStream *ast = NULL;
- char str_track[4];
int avih_width=0, avih_height=0;
int amv_file_format=0;
@@ -561,26 +564,25 @@ static int avi_read_header(AVFormatContext *s, AVFormatParameters *ap)
url_fseek(pb, size, SEEK_CUR);
break;
case MKTAG('I', 'N', 'A', 'M'):
- avi_read_tag(pb, s->title, sizeof(s->title), size);
+ avi_read_tag(s, "Title", size);
break;
case MKTAG('I', 'A', 'R', 'T'):
- avi_read_tag(pb, s->author, sizeof(s->author), size);
+ avi_read_tag(s, "Artist", size);
break;
case MKTAG('I', 'C', 'O', 'P'):
- avi_read_tag(pb, s->copyright, sizeof(s->copyright), size);
+ avi_read_tag(s, "Copyright", size);
break;
case MKTAG('I', 'C', 'M', 'T'):
- avi_read_tag(pb, s->comment, sizeof(s->comment), size);
+ avi_read_tag(s, "Comment", size);
break;
case MKTAG('I', 'G', 'N', 'R'):
- avi_read_tag(pb, s->genre, sizeof(s->genre), size);
+ avi_read_tag(s, "Genre", size);
break;
case MKTAG('I', 'P', 'R', 'D'):
- avi_read_tag(pb, s->album, sizeof(s->album), size);
+ avi_read_tag(s, "Album", size);
break;
case MKTAG('I', 'P', 'R', 'T'):
- avi_read_tag(pb, str_track, sizeof(str_track), size);
- sscanf(str_track, "%d", &s->track);
+ avi_read_tag(s, "Track", size);
break;
default:
if(size > 1000000){
@@ -668,8 +670,12 @@ static int avi_read_packet(AVFormatContext *s, AVPacket *pkt)
best_ts= av_rescale(best_ts, best_st->time_base.den, AV_TIME_BASE * (int64_t)best_st->time_base.num); //FIXME a little ugly
if(best_ast->remaining)
i= av_index_search_timestamp(best_st, best_ts, AVSEEK_FLAG_ANY | AVSEEK_FLAG_BACKWARD);
- else
+ else{
i= av_index_search_timestamp(best_st, best_ts, AVSEEK_FLAG_ANY);
+ if(i>=0)
+ best_ast->frame_offset= best_st->index_entries[i].timestamp
+ * FFMAX(1, best_ast->sample_size);
+ }
// av_log(NULL, AV_LOG_DEBUG, "%d\n", i);
if(i>=0){
diff --git a/libavformat/avienc.c b/libavformat/avienc.c
index a672e8d..cea513f 100644
--- a/libavformat/avienc.c
+++ b/libavformat/avienc.c
@@ -103,6 +103,15 @@ static void avi_write_info_tag(ByteIOContext *pb, const char *tag, const char *s
}
}
+static void avi_write_info_tag2(AVFormatContext *s, const char *fourcc, const char *key1, const char *key2)
+{
+ AVMetadataTag *tag= av_metadata_get(s->metadata, key1, NULL, AV_METADATA_IGNORE_CASE);
+ if(!tag && key2)
+ tag= av_metadata_get(s->metadata, key2, NULL, AV_METADATA_IGNORE_CASE);
+ if(tag)
+ avi_write_info_tag(s->pb, fourcc, tag->value);
+}
+
static int avi_write_counters(AVFormatContext* s, int riff_id)
{
ByteIOContext *pb = s->pb;
@@ -332,17 +341,13 @@ static int avi_write_header(AVFormatContext *s)
list2 = start_tag(pb, "LIST");
put_tag(pb, "INFO");
- avi_write_info_tag(pb, "INAM", s->title);
- avi_write_info_tag(pb, "IART", s->author);
- avi_write_info_tag(pb, "ICOP", s->copyright);
- avi_write_info_tag(pb, "ICMT", s->comment);
- avi_write_info_tag(pb, "IPRD", s->album);
- avi_write_info_tag(pb, "IGNR", s->genre);
- if (s->track) {
- char str_track[4];
- snprintf(str_track, 4, "%d", s->track);
- avi_write_info_tag(pb, "IPRT", str_track);
- }
+ avi_write_info_tag2(s, "INAM", "Title", NULL);
+ avi_write_info_tag2(s, "IART", "Artist", "Author");
+ avi_write_info_tag2(s, "ICOP", "Copyright", NULL);
+ avi_write_info_tag2(s, "ICMT", "Comment", NULL);
+ avi_write_info_tag2(s, "IPRD", "Album", NULL);
+ avi_write_info_tag2(s, "IGNR", "Genre", NULL);
+ avi_write_info_tag2(s, "IPRT", "Track", NULL);
if(!(s->streams[0]->codec->flags & CODEC_FLAG_BITEXACT))
avi_write_info_tag(pb, "ISFT", LIBAVFORMAT_IDENT);
end_tag(pb, list2);
diff --git a/libavformat/avio.h b/libavformat/avio.h
index 687333e..3bb88b3 100644
--- a/libavformat/avio.h
+++ b/libavformat/avio.h
@@ -114,8 +114,8 @@ int av_url_read_pause(URLContext *h, int pause);
* @return >= 0 on success
* @see AVInputFormat::read_seek
*/
-int64_t av_url_read_seek(URLContext *h,
- int stream_index, int64_t timestamp, int flags);
+int64_t av_url_read_seek(URLContext *h, int stream_index,
+ int64_t timestamp, int flags);
/**
* Passing this as the "whence" parameter to a seek function causes it to
@@ -133,8 +133,8 @@ typedef struct URLProtocol {
int (*url_close)(URLContext *h);
struct URLProtocol *next;
int (*url_read_pause)(URLContext *h, int pause);
- int64_t (*url_read_seek)(URLContext *h,
- int stream_index, int64_t timestamp, int flags);
+ int64_t (*url_read_seek)(URLContext *h, int stream_index,
+ int64_t timestamp, int flags);
} URLProtocol;
extern URLProtocol *first_protocol;
@@ -170,8 +170,8 @@ typedef struct {
unsigned long (*update_checksum)(unsigned long checksum, const uint8_t *buf, unsigned int size);
int error; ///< contains the error code or 0 if no error happened
int (*read_pause)(void *opaque, int pause);
- int64_t (*read_seek)(void *opaque,
- int stream_index, int64_t timestamp, int flags);
+ int64_t (*read_seek)(void *opaque, int stream_index,
+ int64_t timestamp, int flags);
} ByteIOContext;
int init_put_byte(ByteIOContext *s,
@@ -238,8 +238,8 @@ int url_feof(ByteIOContext *s);
int url_ferror(ByteIOContext *s);
int av_url_read_fpause(ByteIOContext *h, int pause);
-int64_t av_url_read_fseek(ByteIOContext *h,
- int stream_index, int64_t timestamp, int flags);
+int64_t av_url_read_fseek(ByteIOContext *h, int stream_index,
+ int64_t timestamp, int flags);
#define URL_EOF (-1)
/** @note return URL_EOF (-1) if EOF */
@@ -253,7 +253,7 @@ int url_fprintf(ByteIOContext *s, const char *fmt, ...);
#endif
/** @note unlike fgets, the EOL character is not returned and a whole
- line is parsed. return NULL if first char read was EOF */
+ line is parsed. return NULL if first char read was EOF */
char *url_fgets(ByteIOContext *s, char *buf, int buf_size);
void put_flush_packet(ByteIOContext *s);
@@ -274,7 +274,7 @@ int get_buffer(ByteIOContext *s, unsigned char *buf, int size);
int get_partial_buffer(ByteIOContext *s, unsigned char *buf, int size);
/** @note return 0 if EOF, so you cannot use it if EOF handling is
- necessary */
+ necessary */
int get_byte(ByteIOContext *s);
unsigned int get_le24(ByteIOContext *s);
unsigned int get_le32(ByteIOContext *s);
@@ -295,7 +295,7 @@ static inline int url_is_streamed(ByteIOContext *s)
}
/** @note when opened as read/write, the buffers are only used for
- writing */
+ writing */
int url_fdopen(ByteIOContext **s, URLContext *h);
/** @warning must be called before any I/O */
@@ -307,7 +307,7 @@ int url_setbufsize(ByteIOContext *s, int buf_size);
int url_resetbuf(ByteIOContext *s, int flags);
/** @note when opened as read/write, the buffers are only used for
- writing */
+ writing */
int url_fopen(ByteIOContext **s, const char *filename, int flags);
int url_fclose(ByteIOContext *s);
URLContext *url_fileno(ByteIOContext *s);
@@ -355,9 +355,12 @@ int url_open_dyn_packet_buf(ByteIOContext **s, int max_packet_size);
*/
int url_close_dyn_buf(ByteIOContext *s, uint8_t **pbuffer);
-unsigned long ff_crc04C11DB7_update(unsigned long checksum, const uint8_t *buf, unsigned int len);
+unsigned long ff_crc04C11DB7_update(unsigned long checksum, const uint8_t *buf,
+ unsigned int len);
unsigned long get_checksum(ByteIOContext *s);
-void init_checksum(ByteIOContext *s, unsigned long (*update_checksum)(unsigned long c, const uint8_t *p, unsigned int len), unsigned long checksum);
+void init_checksum(ByteIOContext *s,
+ unsigned long (*update_checksum)(unsigned long c, const uint8_t *p, unsigned int len),
+ unsigned long checksum);
/* udp.c */
int udp_set_remote_url(URLContext *h, const char *uri);
diff --git a/libavformat/aviobuf.c b/libavformat/aviobuf.c
index e19e70b..d1dc09b 100644
--- a/libavformat/aviobuf.c
+++ b/libavformat/aviobuf.c
@@ -40,8 +40,8 @@ int init_put_byte(ByteIOContext *s,
s->buffer = buffer;
s->buffer_size = buffer_size;
s->buf_ptr = buffer;
- url_resetbuf(s, write_flag ? URL_WRONLY : URL_RDONLY);
s->opaque = opaque;
+ url_resetbuf(s, write_flag ? URL_WRONLY : URL_RDONLY);
s->write_packet = write_packet;
s->read_packet = read_packet;
s->seek = seek;
@@ -68,7 +68,8 @@ ByteIOContext *av_alloc_put_byte(
void *opaque,
int (*read_packet)(void *opaque, uint8_t *buf, int buf_size),
int (*write_packet)(void *opaque, uint8_t *buf, int buf_size),
- int64_t (*seek)(void *opaque, int64_t offset, int whence)) {
+ int64_t (*seek)(void *opaque, int64_t offset, int whence))
+{
ByteIOContext *s = av_mallocz(sizeof(ByteIOContext));
init_put_byte(s, buffer, buffer_size, write_flag, opaque,
read_packet, write_packet, seek);
@@ -322,17 +323,23 @@ static void fill_buffer(ByteIOContext *s)
}
}
-unsigned long ff_crc04C11DB7_update(unsigned long checksum, const uint8_t *buf, unsigned int len){
+unsigned long ff_crc04C11DB7_update(unsigned long checksum, const uint8_t *buf,
+ unsigned int len)
+{
return av_crc(av_crc_get_table(AV_CRC_32_IEEE), checksum, buf, len);
}
-unsigned long get_checksum(ByteIOContext *s){
+unsigned long get_checksum(ByteIOContext *s)
+{
s->checksum= s->update_checksum(s->checksum, s->checksum_ptr, s->buf_ptr - s->checksum_ptr);
s->update_checksum= NULL;
return s->checksum;
}
-void init_checksum(ByteIOContext *s, unsigned long (*update_checksum)(unsigned long c, const uint8_t *p, unsigned int len), unsigned long checksum){
+void init_checksum(ByteIOContext *s,
+ unsigned long (*update_checksum)(unsigned long c, const uint8_t *p, unsigned int len),
+ unsigned long checksum)
+{
s->update_checksum= update_checksum;
if(s->update_checksum){
s->checksum= checksum;
@@ -665,8 +672,8 @@ int av_url_read_fpause(ByteIOContext *s, int pause)
return s->read_pause(s->opaque, pause);
}
-int64_t av_url_read_fseek(ByteIOContext *s,
- int stream_index, int64_t timestamp, int flags)
+int64_t av_url_read_fseek(ByteIOContext *s, int stream_index,
+ int64_t timestamp, int flags)
{
URLContext *h = s->opaque;
int64_t ret;
diff --git a/libavformat/avs.c b/libavformat/avs.c
index 6fcb230..1fcb19f 100644
--- a/libavformat/avs.c
+++ b/libavformat/avs.c
@@ -24,7 +24,7 @@
typedef struct avs_format {
- voc_dec_context_t voc;
+ VocDecContext voc;
AVStream *st_video;
AVStream *st_audio;
int width;
@@ -34,7 +34,7 @@ typedef struct avs_format {
int nb_frames;
int remaining_frame_size;
int remaining_audio_size;
-} avs_format_t;
+} AvsFormat;
typedef enum avs_block_type {
AVS_NONE = 0x00,
@@ -42,7 +42,7 @@ typedef enum avs_block_type {
AVS_AUDIO = 0x02,
AVS_PALETTE = 0x03,
AVS_GAME_DATA = 0x04,
-} avs_block_type_t;
+} AvsBlockType;
static int avs_probe(AVProbeData * p)
{
@@ -57,7 +57,7 @@ static int avs_probe(AVProbeData * p)
static int avs_read_header(AVFormatContext * s, AVFormatParameters * ap)
{
- avs_format_t *avs = s->priv_data;
+ AvsFormat *avs = s->priv_data;
s->ctx_flags |= AVFMTCTX_NOHEADER;
@@ -82,10 +82,10 @@ static int avs_read_header(AVFormatContext * s, AVFormatParameters * ap)
static int
avs_read_video_packet(AVFormatContext * s, AVPacket * pkt,
- avs_block_type_t type, int sub_type, int size,
+ AvsBlockType type, int sub_type, int size,
uint8_t * palette, int palette_size)
{
- avs_format_t *avs = s->priv_data;
+ AvsFormat *avs = s->priv_data;
int ret;
ret = av_new_packet(pkt, size + palette_size);
@@ -120,7 +120,7 @@ avs_read_video_packet(AVFormatContext * s, AVPacket * pkt,
static int avs_read_audio_packet(AVFormatContext * s, AVPacket * pkt)
{
- avs_format_t *avs = s->priv_data;
+ AvsFormat *avs = s->priv_data;
int ret, size;
size = url_ftell(s->pb);
@@ -141,9 +141,9 @@ static int avs_read_audio_packet(AVFormatContext * s, AVPacket * pkt)
static int avs_read_packet(AVFormatContext * s, AVPacket * pkt)
{
- avs_format_t *avs = s->priv_data;
+ AvsFormat *avs = s->priv_data;
int sub_type = 0, size = 0;
- avs_block_type_t type = AVS_NONE;
+ AvsBlockType type = AVS_NONE;
int palette_size = 0;
uint8_t palette[4 + 3 * 256];
int ret;
@@ -218,7 +218,7 @@ static int avs_read_close(AVFormatContext * s)
AVInputFormat avs_demuxer = {
"avs",
NULL_IF_CONFIG_SMALL("AVS format"),
- sizeof(avs_format_t),
+ sizeof(AvsFormat),
avs_probe,
avs_read_header,
avs_read_packet,
diff --git a/libavformat/c93.c b/libavformat/c93.c
index c377f4d..11a0314 100644
--- a/libavformat/c93.c
+++ b/libavformat/c93.c
@@ -29,7 +29,7 @@ typedef struct {
} C93BlockRecord;
typedef struct {
- voc_dec_context_t voc;
+ VocDecContext voc;
C93BlockRecord block_records[512];
int current_block;
diff --git a/libavformat/dv.c b/libavformat/dv.c
index f7a0146..a0e07e9 100644
--- a/libavformat/dv.c
+++ b/libavformat/dv.c
@@ -430,6 +430,8 @@ static int dv_read_packet(AVFormatContext *s, AVPacket *pkt)
size = dv_get_packet(c->dv_demux, pkt);
if (size < 0) {
+ if (!c->dv_demux->sys)
+ return AVERROR(EIO);
size = c->dv_demux->sys->frame_size;
if (get_buffer(s->pb, c->buf, size) <= 0)
return AVERROR(EIO);
diff --git a/libavformat/dv.h b/libavformat/dv.h
index a8c0514..6bfb4ab 100644
--- a/libavformat/dv.h
+++ b/libavformat/dv.h
@@ -38,7 +38,7 @@ void dv_offset_reset(DVDemuxContext *c, int64_t frame_offset);
typedef struct DVMuxContext DVMuxContext;
DVMuxContext* dv_init_mux(AVFormatContext* s);
-int dv_assemble_frame(DVMuxContext *c, AVStream*, const uint8_t*, int, uint8_t**);
+int dv_assemble_frame(DVMuxContext *c, AVStream*, uint8_t*, int, uint8_t**);
void dv_delete_mux(DVMuxContext*);
#endif /* AVFORMAT_DV_H */
diff --git a/libavformat/dvenc.c b/libavformat/dvenc.c
index 7245e6e..c8dc5a8 100644
--- a/libavformat/dvenc.c
+++ b/libavformat/dvenc.c
@@ -231,7 +231,7 @@ static void dv_inject_metadata(DVMuxContext *c, uint8_t* frame)
*/
int dv_assemble_frame(DVMuxContext *c, AVStream* st,
- const uint8_t* data, int data_size, uint8_t** frame)
+ uint8_t* data, int data_size, uint8_t** frame)
{
int i, reqasize;
diff --git a/libavformat/flvdec.c b/libavformat/flvdec.c
index 0b25a25..01da9e8 100644
--- a/libavformat/flvdec.c
+++ b/libavformat/flvdec.c
@@ -26,6 +26,10 @@
#include "avformat.h"
#include "flv.h"
+typedef struct {
+ int wrong_dts; ///< wrong dts due to negative cts
+} FLVContext;
+
static int flv_probe(AVProbeData *p)
{
const uint8_t *d;
@@ -299,9 +303,10 @@ static int flv_get_extradata(AVFormatContext *s, AVStream *st, int size)
static int flv_read_packet(AVFormatContext *s, AVPacket *pkt)
{
+ FLVContext *flv = s->priv_data;
int ret, i, type, size, flags, is_audio;
int64_t next, pos;
- unsigned dts;
+ int64_t dts, pts = AV_NOPTS_VALUE;
AVStream *st = NULL;
retry:
@@ -386,10 +391,12 @@ static int flv_read_packet(AVFormatContext *s, AVPacket *pkt)
}
if(is_audio){
- if(!st->codec->channels || !st->codec->sample_rate || !st->codec->bits_per_coded_sample || (!st->codec->codec_id && !st->codec->codec_tag)) {
+ if(!st->codec->channels || !st->codec->sample_rate || !st->codec->bits_per_coded_sample) {
st->codec->channels = (flags & FLV_AUDIO_CHANNEL_MASK) == FLV_STEREO ? 2 : 1;
st->codec->sample_rate = (44100 << ((flags & FLV_AUDIO_SAMPLERATE_MASK) >> FLV_AUDIO_SAMPLERATE_OFFSET) >> 3);
st->codec->bits_per_coded_sample = (flags & FLV_AUDIO_SAMPLESIZE_MASK) ? 16 : 8;
+ }
+ if(!st->codec->codec_id){
flv_set_audio_codec(s, st, flags & FLV_AUDIO_CODECID_MASK);
}
}else{
@@ -401,9 +408,14 @@ static int flv_read_packet(AVFormatContext *s, AVPacket *pkt)
int type = get_byte(s->pb);
size--;
if (st->codec->codec_id == CODEC_ID_H264) {
- // cts offset ignored because it might to be signed
- // and would cause pts < dts
- get_be24(s->pb);
+ int32_t cts = (get_be24(s->pb)+0xff800000)^0xff800000; // sign extension
+ pts = dts + cts;
+ if (cts < 0) { // dts are wrong
+ flv->wrong_dts = 1;
+ av_log(s, AV_LOG_WARNING, "negative cts, previous timestamps might be wrong\n");
+ }
+ if (flv->wrong_dts)
+ dts = AV_NOPTS_VALUE;
}
if (type == 0) {
if ((ret = flv_get_extradata(s, st, size)) < 0)
@@ -420,6 +432,7 @@ static int flv_read_packet(AVFormatContext *s, AVPacket *pkt)
packet */
pkt->size = ret;
pkt->dts = dts;
+ pkt->pts = pts == AV_NOPTS_VALUE ? dts : pts;
pkt->stream_index = st->index;
if (is_audio || ((flags & FLV_VIDEO_FRAMETYPE_MASK) == FLV_FRAME_KEY))
@@ -431,7 +444,7 @@ static int flv_read_packet(AVFormatContext *s, AVPacket *pkt)
AVInputFormat flv_demuxer = {
"flv",
NULL_IF_CONFIG_SMALL("FLV format"),
- 0,
+ sizeof(FLVContext),
flv_probe,
flv_read_header,
flv_read_packet,
diff --git a/libavformat/framehook.h b/libavformat/framehook.h
index 9dc144b..0bad606 100644
--- a/libavformat/framehook.h
+++ b/libavformat/framehook.h
@@ -45,8 +45,8 @@ typedef void (FrameHookRelease)(void *ctx);
typedef FrameHookRelease *FrameHookReleaseFn;
extern FrameHookRelease Release;
-extern int frame_hook_add(int argc, char *argv[]);
-extern void frame_hook_process(struct AVPicture *pict, enum PixelFormat pix_fmt, int width, int height, int64_t pts);
-extern void frame_hook_release(void);
+int frame_hook_add(int argc, char *argv[]);
+void frame_hook_process(struct AVPicture *pict, enum PixelFormat pix_fmt, int width, int height, int64_t pts);
+void frame_hook_release(void);
#endif /* AVFORMAT_FRAMEHOOK_H */
diff --git a/libavformat/gxf.c b/libavformat/gxf.c
index 5bbac82..30cb4f7 100644
--- a/libavformat/gxf.c
+++ b/libavformat/gxf.c
@@ -23,12 +23,12 @@
#include "avformat.h"
#include "gxf.h"
-typedef struct {
+struct gxf_stream_info {
int64_t first_field;
int64_t last_field;
AVRational frames_per_second;
int32_t fields_per_frame;
-} st_info_t;
+};
/**
* \brief parses a packet header, extracting type and length
@@ -37,7 +37,7 @@ typedef struct {
* \param length detected packet length, excluding header is stored here
* \return 0 if header not found or contains invalid data, 1 otherwise
*/
-static int parse_packet_header(ByteIOContext *pb, pkt_type_t *type, int *length) {
+static int parse_packet_header(ByteIOContext *pb, GXFPktType *type, int *length) {
if (get_be32(pb))
return 0;
if (get_byte(pb) != 1)
@@ -157,11 +157,11 @@ static int get_sindex(AVFormatContext *s, int id, int format) {
* \param len length of tag section, will be adjusted to contain remaining bytes
* \param si struct to store collected information into
*/
-static void gxf_material_tags(ByteIOContext *pb, int *len, st_info_t *si) {
+static void gxf_material_tags(ByteIOContext *pb, int *len, struct gxf_stream_info *si) {
si->first_field = AV_NOPTS_VALUE;
si->last_field = AV_NOPTS_VALUE;
while (*len >= 2) {
- mat_tag_t tag = get_byte(pb);
+ GXFMatTag tag = get_byte(pb);
int tlen = get_byte(pb);
*len -= 2;
if (tlen > *len)
@@ -206,11 +206,11 @@ static AVRational fps_umf2avr(uint32_t flags) {
* \param len length of tag section, will be adjusted to contain remaining bytes
* \param si struct to store collected information into
*/
-static void gxf_track_tags(ByteIOContext *pb, int *len, st_info_t *si) {
+static void gxf_track_tags(ByteIOContext *pb, int *len, struct gxf_stream_info *si) {
si->frames_per_second = (AVRational){0, 0};
si->fields_per_frame = 0;
while (*len >= 2) {
- track_tag_t tag = get_byte(pb);
+ GXFTrackTag tag = get_byte(pb);
int tlen = get_byte(pb);
*len -= 2;
if (tlen > *len)
@@ -256,11 +256,11 @@ static void gxf_read_index(AVFormatContext *s, int pkt_len) {
static int gxf_header(AVFormatContext *s, AVFormatParameters *ap) {
ByteIOContext *pb = s->pb;
- pkt_type_t pkt_type;
+ GXFPktType pkt_type;
int map_len;
int len;
AVRational main_timebase = {0, 0};
- st_info_t si;
+ struct gxf_stream_info si;
int i;
if (!parse_packet_header(pb, &pkt_type, &map_len) || pkt_type != PKT_MAP) {
av_log(s, AV_LOG_ERROR, "map packet not found\n");
@@ -382,7 +382,7 @@ static int64_t gxf_resync_media(AVFormatContext *s, uint64_t max_interval, int t
int64_t cur_timestamp = AV_NOPTS_VALUE;
int len;
ByteIOContext *pb = s->pb;
- pkt_type_t type;
+ GXFPktType type;
tmp = get_be32(pb);
start:
while (tmp)
@@ -412,7 +412,7 @@ out:
static int gxf_packet(AVFormatContext *s, AVPacket *pkt) {
ByteIOContext *pb = s->pb;
- pkt_type_t pkt_type;
+ GXFPktType pkt_type;
int pkt_len;
while (!url_feof(pb)) {
AVStream *st;
diff --git a/libavformat/gxf.h b/libavformat/gxf.h
index 4212704..dcdcdef 100644
--- a/libavformat/gxf.h
+++ b/libavformat/gxf.h
@@ -22,14 +22,13 @@
#ifndef AVFORMAT_GXF_H
#define AVFORMAT_GXF_H
-/* gxf.c */
typedef enum {
PKT_MAP = 0xbc,
PKT_MEDIA = 0xbf,
PKT_EOS = 0xfb,
PKT_FLT = 0xfc,
PKT_UMF = 0xfd,
-} pkt_type_t;
+} GXFPktType;
typedef enum {
MAT_NAME = 0x40,
@@ -38,7 +37,7 @@ typedef enum {
MAT_MARK_IN = 0x43,
MAT_MARK_OUT = 0x44,
MAT_SIZE = 0x45,
-} mat_tag_t;
+} GXFMatTag;
typedef enum {
TRACK_NAME = 0x4c,
@@ -48,6 +47,6 @@ typedef enum {
TRACK_FPS = 0x50,
TRACK_LINES = 0x51,
TRACK_FPF = 0x52,
-} track_tag_t;
+} GXFTrackTag;
#endif /* AVFORMAT_GXF_H */
diff --git a/libavformat/gxfenc.c b/libavformat/gxfenc.c
index 72a35c3..396555a 100644
--- a/libavformat/gxfenc.c
+++ b/libavformat/gxfenc.c
@@ -155,7 +155,7 @@ static int64_t updateSize(ByteIOContext *pb, int64_t pos)
return curpos - pos;
}
-static void gxf_write_packet_header(ByteIOContext *pb, pkt_type_t type)
+static void gxf_write_packet_header(ByteIOContext *pb, GXFPktType type)
{
put_be32(pb, 0); /* packet leader for synchro */
put_byte(pb, 1);
diff --git a/libavformat/iff.c b/libavformat/iff.c
index 1fb94c0..e5158f6 100644
--- a/libavformat/iff.c
+++ b/libavformat/iff.c
@@ -52,7 +52,7 @@
#define PACKET_SIZE 1024
-typedef enum {COMP_NONE, COMP_FIB, COMP_EXP} svx8_compression_t;
+typedef enum {COMP_NONE, COMP_FIB, COMP_EXP} svx8_compression_type;
typedef struct {
uint32_t body_size;
diff --git a/libavformat/img2.c b/libavformat/img2.c
index 9706392..c0b5de2 100644
--- a/libavformat/img2.c
+++ b/libavformat/img2.c
@@ -45,6 +45,7 @@ static const IdStrMap img_tags[] = {
{ CODEC_ID_PNG , "png"},
{ CODEC_ID_PNG , "mng"},
{ CODEC_ID_PPM , "ppm"},
+ { CODEC_ID_PPM , "pnm"},
{ CODEC_ID_PGM , "pgm"},
{ CODEC_ID_PGMYUV , "pgmyuv"},
{ CODEC_ID_PBM , "pbm"},
@@ -69,6 +70,7 @@ static const IdStrMap img_tags[] = {
{ CODEC_ID_SUNRAST , "im8"},
{ CODEC_ID_SUNRAST , "im24"},
{ CODEC_ID_SUNRAST , "sunras"},
+ { CODEC_ID_JPEG2000 , "jp2"},
{ CODEC_ID_NONE , NULL}
};
@@ -402,7 +404,7 @@ AVOutputFormat image2_muxer = {
"image2",
NULL_IF_CONFIG_SMALL("image2 sequence"),
"",
- "",
+ "bmp,jpeg,jpg,ljpg,pam,pbm,pgm,pgmyuv,png,ppm,sgi,tif,tiff",
sizeof(VideoData),
CODEC_ID_NONE,
CODEC_ID_MJPEG,
diff --git a/libavformat/internal.h b/libavformat/internal.h
index a96f365..cb266ca 100644
--- a/libavformat/internal.h
+++ b/libavformat/internal.h
@@ -22,7 +22,11 @@
#define AVFORMAT_INTERNAL_H
#include <stdint.h>
+#include "avformat.h"
char *ff_data_to_hex(char *buf, const uint8_t *src, int size);
+void av_set_program_name(AVProgram *program, char *provider_name, char *name);
+void av_program_add_stream_index(AVFormatContext *ac, int progid, unsigned int idx);
+
#endif /* AVFORMAT_INTERNAL_H */
diff --git a/libavformat/ipmovie.c b/libavformat/ipmovie.c
index 4a766b2..35c61de 100644
--- a/libavformat/ipmovie.c
+++ b/libavformat/ipmovie.c
@@ -101,7 +101,7 @@ typedef struct IPMVEContext {
unsigned int audio_bits;
unsigned int audio_channels;
unsigned int audio_sample_rate;
- unsigned int audio_type;
+ enum CodecID audio_type;
unsigned int audio_frame_count;
int video_stream_index;
@@ -544,7 +544,7 @@ static int ipmovie_read_header(AVFormatContext *s,
url_fseek(pb, -CHUNK_PREAMBLE_SIZE, SEEK_CUR);
if (chunk_type == CHUNK_VIDEO)
- ipmovie->audio_type = 0; /* no audio */
+ ipmovie->audio_type = CODEC_ID_NONE; /* no audio */
else if (process_ipmovie_chunk(ipmovie, pb, &pkt) != CHUNK_INIT_AUDIO)
return AVERROR_INVALIDDATA;
diff --git a/libavformat/isom.c b/libavformat/isom.c
index 8791354..7d1da7c 100644
--- a/libavformat/isom.c
+++ b/libavformat/isom.c
@@ -106,6 +106,8 @@ const AVCodecTag codec_movvideo_tags[] = {
{ CODEC_ID_MSRLE, MKTAG('W', 'R', 'L', 'E') },
{ CODEC_ID_QDRAW, MKTAG('q', 'd', 'r', 'w') }, /* QuickDraw */
+ { CODEC_ID_RAWVIDEO, MKTAG('W', 'R', 'A', 'W') },
+
{ CODEC_ID_H264, MKTAG('a', 'v', 'c', '1') }, /* AVC-1/H.264 */
{ CODEC_ID_MPEG1VIDEO, MKTAG('m', 'p', 'e', 'g') }, /* MPEG */
@@ -146,7 +148,7 @@ const AVCodecTag codec_movvideo_tags[] = {
{ CODEC_ID_MPEG2VIDEO, MKTAG('x', 'd', 'v', 'f') }, /* XDCAM EX 1080p30 VBR */
{ CODEC_ID_MPEG2VIDEO, MKTAG('A', 'V', 'm', 'p') }, /* AVID IMX PAL */
- //{ CODEC_ID_JPEG2000, MKTAG('m', 'j', 'p', '2') }, /* JPEG 2000 produced by FCP */
+ { CODEC_ID_JPEG2000, MKTAG('m', 'j', 'p', '2') }, /* JPEG 2000 produced by FCP */
{ CODEC_ID_TARGA, MKTAG('t', 'g', 'a', ' ') }, /* Truevision Targa */
{ CODEC_ID_TIFF, MKTAG('t', 'i', 'f', 'f') }, /* TIFF embedded in MOV */
diff --git a/libavformat/isom.h b/libavformat/isom.h
index 2fb4dbf..64bde46 100644
--- a/libavformat/isom.h
+++ b/libavformat/isom.h
@@ -38,6 +38,6 @@ int ff_mov_lang_to_iso639(int code, char *to);
typedef struct {
int count;
int duration;
-} MOV_stts_t;
+} MOVStts;
#endif /* AVFORMAT_ISOM_H */
diff --git a/libavformat/libnut.c b/libavformat/libnut.c
index 399ef9c..3f4a69b 100644
--- a/libavformat/libnut.c
+++ b/libavformat/libnut.c
@@ -33,8 +33,8 @@
#define ID_LENGTH (strlen(ID_STRING) + 1)
typedef struct {
- nut_context_t * nut;
- nut_stream_header_t * s;
+ nut_context_tt * nut;
+ nut_stream_header_tt * s;
} NUTContext;
static const AVCodecTag nut_tags[] = {
@@ -55,7 +55,7 @@ static int av_write(void * h, size_t len, const uint8_t * buf) {
static int nut_write_header(AVFormatContext * avf) {
NUTContext * priv = avf->priv_data;
ByteIOContext * bc = avf->pb;
- nut_muxer_opts_t mopts = {
+ nut_muxer_opts_tt mopts = {
.output = {
.priv = bc,
.write = av_write,
@@ -66,7 +66,7 @@ static int nut_write_header(AVFormatContext * avf) {
.max_distance = 32768,
.fti = NULL,
};
- nut_stream_header_t * s;
+ nut_stream_header_tt * s;
int i;
priv->s = s = av_mallocz((avf->nb_streams + 1) * sizeof*s);
@@ -123,7 +123,7 @@ static int nut_write_header(AVFormatContext * avf) {
static int nut_write_packet(AVFormatContext * avf, AVPacket * pkt) {
NUTContext * priv = avf->priv_data;
- nut_packet_t p;
+ nut_packet_tt p;
p.len = pkt->size;
p.stream = pkt->stream_index;
@@ -188,7 +188,7 @@ static off_t av_seek(void * h, long long pos, int whence) {
static int nut_read_header(AVFormatContext * avf, AVFormatParameters * ap) {
NUTContext * priv = avf->priv_data;
ByteIOContext * bc = avf->pb;
- nut_demuxer_opts_t dopts = {
+ nut_demuxer_opts_tt dopts = {
.input = {
.priv = bc,
.seek = av_seek,
@@ -200,8 +200,8 @@ static int nut_read_header(AVFormatContext * avf, AVFormatParameters * ap) {
.read_index = 1,
.cache_syncpoints = 1,
};
- nut_context_t * nut = priv->nut = nut_demuxer_init(&dopts);
- nut_stream_header_t * s;
+ nut_context_tt * nut = priv->nut = nut_demuxer_init(&dopts);
+ nut_stream_header_tt * s;
int ret, i;
if ((ret = nut_read_headers(nut, &s, NULL))) {
@@ -258,7 +258,7 @@ static int nut_read_header(AVFormatContext * avf, AVFormatParameters * ap) {
static int nut_read_packet(AVFormatContext * avf, AVPacket * pkt) {
NUTContext * priv = avf->priv_data;
- nut_packet_t pd;
+ nut_packet_tt pd;
int ret;
ret = nut_read_next_packet(priv->nut, &pd);
diff --git a/libavformat/matroska.c b/libavformat/matroska.c
index 0b657e1..f967f3e 100644
--- a/libavformat/matroska.c
+++ b/libavformat/matroska.c
@@ -22,25 +22,16 @@
#include "matroska.h"
const CodecTags ff_mkv_codec_tags[]={
- {"V_UNCOMPRESSED" , CODEC_ID_RAWVIDEO},
- {"V_MPEG4/ISO/ASP" , CODEC_ID_MPEG4},
- {"V_MPEG4/ISO/SP" , CODEC_ID_MPEG4},
- {"V_MPEG4/ISO/AP" , CODEC_ID_MPEG4},
- {"V_MPEG4/ISO/AVC" , CODEC_ID_H264},
- {"V_MPEG4/MS/V3" , CODEC_ID_MSMPEG4V3},
- {"V_MPEG1" , CODEC_ID_MPEG1VIDEO},
- {"V_MPEG2" , CODEC_ID_MPEG2VIDEO},
- {"V_MJPEG" , CODEC_ID_MJPEG},
- {"V_REAL/RV10" , CODEC_ID_RV10},
- {"V_REAL/RV20" , CODEC_ID_RV20},
- {"V_REAL/RV30" , CODEC_ID_RV30},
- {"V_REAL/RV40" , CODEC_ID_RV40},
- {"V_THEORA" , CODEC_ID_THEORA},
- {"V_SNOW" , CODEC_ID_SNOW},
-
- {"A_MPEG/L3" , CODEC_ID_MP3},
+ {"A_AAC" , CODEC_ID_AAC},
+ {"A_AC3" , CODEC_ID_AC3},
+ {"A_DTS" , CODEC_ID_DTS},
+ {"A_EAC3" , CODEC_ID_EAC3},
+ {"A_FLAC" , CODEC_ID_FLAC},
{"A_MPEG/L2" , CODEC_ID_MP2},
{"A_MPEG/L1" , CODEC_ID_MP2},
+ {"A_MPEG/L3" , CODEC_ID_MP3},
+ {"A_PCM/FLOAT/IEEE" , CODEC_ID_PCM_F32LE},
+ {"A_PCM/FLOAT/IEEE" , CODEC_ID_PCM_F64LE},
{"A_PCM/INT/BIG" , CODEC_ID_PCM_S16BE},
{"A_PCM/INT/BIG" , CODEC_ID_PCM_S24BE},
{"A_PCM/INT/BIG" , CODEC_ID_PCM_S32BE},
@@ -48,22 +39,15 @@ const CodecTags ff_mkv_codec_tags[]={
{"A_PCM/INT/LIT" , CODEC_ID_PCM_S24LE},
{"A_PCM/INT/LIT" , CODEC_ID_PCM_S32LE},
{"A_PCM/INT/LIT" , CODEC_ID_PCM_U8},
- {"A_PCM/FLOAT/IEEE" , CODEC_ID_PCM_F32LE},
- {"A_PCM/FLOAT/IEEE" , CODEC_ID_PCM_F64LE},
- {"A_AC3" , CODEC_ID_AC3},
- {"A_EAC3" , CODEC_ID_EAC3},
- {"A_DTS" , CODEC_ID_DTS},
- {"A_VORBIS" , CODEC_ID_VORBIS},
- {"A_AAC" , CODEC_ID_AAC},
- {"A_FLAC" , CODEC_ID_FLAC},
- {"A_WAVPACK4" , CODEC_ID_WAVPACK},
- {"A_TTA1" , CODEC_ID_TTA},
+ {"A_QUICKTIME/QDM2" , CODEC_ID_QDM2},
{"A_REAL/14_4" , CODEC_ID_RA_144},
{"A_REAL/28_8" , CODEC_ID_RA_288},
{"A_REAL/ATRC" , CODEC_ID_ATRAC3},
{"A_REAL/COOK" , CODEC_ID_COOK},
// {"A_REAL/SIPR" , CODEC_ID_SIPRO},
- {"A_QUICKTIME/QDM2" , CODEC_ID_QDM2},
+ {"A_TTA1" , CODEC_ID_TTA},
+ {"A_VORBIS" , CODEC_ID_VORBIS},
+ {"A_WAVPACK4" , CODEC_ID_WAVPACK},
{"S_TEXT/UTF8" , CODEC_ID_TEXT},
{"S_TEXT/ASCII" , CODEC_ID_TEXT},
@@ -73,6 +57,23 @@ const CodecTags ff_mkv_codec_tags[]={
{"S_SSA" , CODEC_ID_SSA},
{"S_VOBSUB" , CODEC_ID_DVD_SUBTITLE},
+ {"V_DIRAC" , CODEC_ID_DIRAC},
+ {"V_MJPEG" , CODEC_ID_MJPEG},
+ {"V_MPEG1" , CODEC_ID_MPEG1VIDEO},
+ {"V_MPEG2" , CODEC_ID_MPEG2VIDEO},
+ {"V_MPEG4/ISO/ASP" , CODEC_ID_MPEG4},
+ {"V_MPEG4/ISO/AP" , CODEC_ID_MPEG4},
+ {"V_MPEG4/ISO/SP" , CODEC_ID_MPEG4},
+ {"V_MPEG4/ISO/AVC" , CODEC_ID_H264},
+ {"V_MPEG4/MS/V3" , CODEC_ID_MSMPEG4V3},
+ {"V_REAL/RV10" , CODEC_ID_RV10},
+ {"V_REAL/RV20" , CODEC_ID_RV20},
+ {"V_REAL/RV30" , CODEC_ID_RV30},
+ {"V_REAL/RV40" , CODEC_ID_RV40},
+ {"V_SNOW" , CODEC_ID_SNOW},
+ {"V_THEORA" , CODEC_ID_THEORA},
+ {"V_UNCOMPRESSED" , CODEC_ID_RAWVIDEO},
+
{"" , CODEC_ID_NONE}
};
diff --git a/libavformat/matroskadec.c b/libavformat/matroskadec.c
index cedfb3d..6478d84 100644
--- a/libavformat/matroskadec.c
+++ b/libavformat/matroskadec.c
@@ -1074,6 +1074,7 @@ static int matroska_read_header(AVFormatContext *s, AVFormatParameters *ap)
MatroskaTrack *tracks;
EbmlList *index_list;
MatroskaIndex *index;
+ int index_scale = 1;
Ebml ebml = { 0 };
AVStream *st;
int i, j;
@@ -1303,7 +1304,8 @@ static int matroska_read_header(AVFormatContext *s, AVFormatParameters *ap)
st->codec->extradata = extradata;
st->codec->extradata_size = extradata_size;
} else if(track->codec_priv.data && track->codec_priv.size > 0){
- st->codec->extradata = av_malloc(track->codec_priv.size);
+ st->codec->extradata = av_mallocz(track->codec_priv.size +
+ FF_INPUT_BUFFER_PADDING_SIZE);
if(st->codec->extradata == NULL)
return AVERROR(ENOMEM);
st->codec->extradata_size = track->codec_priv.size;
@@ -1369,6 +1371,11 @@ static int matroska_read_header(AVFormatContext *s, AVFormatParameters *ap)
index_list = &matroska->index;
index = index_list->elem;
+ if (index_list->nb_elem
+ && index[0].time > 100000000000000/matroska->time_scale) {
+ av_log(matroska->ctx, AV_LOG_WARNING, "Working around broken index.\n");
+ index_scale = matroska->time_scale;
+ }
for (i=0; i<index_list->nb_elem; i++) {
EbmlList *pos_list = &index[i].pos;
MatroskaIndexPos *pos = pos_list->elem;
@@ -1378,7 +1385,8 @@ static int matroska_read_header(AVFormatContext *s, AVFormatParameters *ap)
if (track && track->stream)
av_add_index_entry(track->stream,
pos[j].pos + matroska->segment_start,
- index[i].time, 0, 0, AVINDEX_KEYFRAME);
+ index[i].time/index_scale, 0, 0,
+ AVINDEX_KEYFRAME);
}
}
diff --git a/libavformat/metadata.c b/libavformat/metadata.c
new file mode 100644
index 0000000..9765e7c
--- /dev/null
+++ b/libavformat/metadata.c
@@ -0,0 +1,89 @@
+/*
+ * copyright (c) 2009 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "metadata.h"
+
+AVMetadataTag *
+av_metadata_get(AVMetadata *m, const char *key, const AVMetadataTag *prev, int flags)
+{
+ unsigned int i, j;
+
+ if(!m)
+ return NULL;
+
+ if(prev) i= prev - m->elems + 1;
+ else i= 0;
+
+ for(; i<m->count; i++){
+ const char *s= m->elems[i].key;
+ if(flags & AV_METADATA_IGNORE_CASE) for(j=0; toupper(s[j]) == toupper(key[j]) && key[j]; j++);
+ else for(j=0; s[j] == key[j] && key[j]; j++);
+ if(key[j])
+ continue;
+ if(s[j] && !(flags & AV_METADATA_IGNORE_SUFFIX))
+ continue;
+ return &m->elems[i];
+ }
+ return NULL;
+}
+
+int av_metadata_set(AVMetadata **pm, AVMetadataTag elem)
+{
+ AVMetadata *m= *pm;
+ AVMetadataTag *tag= av_metadata_get(m, elem.key, NULL, 0);
+
+ if(!m)
+ m=*pm= av_mallocz(sizeof(*m));
+
+ if(tag){
+ av_free(tag->value);
+ av_free(tag->key);
+ *tag= m->elems[--m->count];
+ }else{
+ AVMetadataTag *tmp= av_realloc(m->elems, (m->count+1) * sizeof(*m->elems));
+ if(tmp){
+ m->elems= tmp;
+ }else
+ return AVERROR(ENOMEM);
+ }
+ if(elem.value){
+ elem.key = av_strdup(elem.key );
+ elem.value= av_strdup(elem.value);
+ m->elems[m->count++]= elem;
+ }
+ if(!m->count)
+ av_freep(pm);
+
+ return 0;
+}
+
+void av_metadata_free(AVMetadata **pm)
+{
+ AVMetadata *m= *pm;
+
+ if(m){
+ while(m->count--){
+ av_free(m->elems[m->count].key);
+ av_free(m->elems[m->count].value);
+ }
+ av_free(m->elems);
+ }
+ av_freep(pm);
+}
diff --git a/libavformat/metadata.h b/libavformat/metadata.h
new file mode 100644
index 0000000..b6912ef
--- /dev/null
+++ b/libavformat/metadata.h
@@ -0,0 +1,43 @@
+/*
+ * copyright (c) 2009 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFORMAT_METADATA_H
+#define AVFORMAT_METADATA_H
+
+/**
+ * @file metadata.h
+ * internal metadata API header
+ * see avformat.h or the public API!
+ */
+
+
+#include "avformat.h"
+
+struct AVMetadata{
+ int count;
+ AVMetadataTag *elems;
+};
+
+#if LIBAVFORMAT_VERSION_MAJOR < 53
+void ff_metadata_demux_compat(AVFormatContext *s);
+void ff_metadata_mux_compat(AVFormatContext *s);
+#endif
+
+#endif /* AVFORMAT_METADATA_H */
diff --git a/libavformat/metadata_compat.c b/libavformat/metadata_compat.c
new file mode 100644
index 0000000..7aef938
--- /dev/null
+++ b/libavformat/metadata_compat.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2009 Aurelien Jacobs <aurel at gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#if LIBAVFORMAT_VERSION_MAJOR < 53
+
+#include <strings.h>
+#include "metadata.h"
+#include "libavutil/avstring.h"
+
+#define SIZE_OFFSET(x) sizeof(((AVFormatContext*)0)->x),offsetof(AVFormatContext,x)
+
+static const struct {
+ const char name[16];
+ int size;
+ int offset;
+} compat_tab[] = {
+ { "title", SIZE_OFFSET(title) },
+ { "author", SIZE_OFFSET(author) },
+ { "copyright", SIZE_OFFSET(copyright) },
+ { "comment", SIZE_OFFSET(comment) },
+ { "album", SIZE_OFFSET(album) },
+ { "year", SIZE_OFFSET(year) },
+ { "track", SIZE_OFFSET(track) },
+ { "genre", SIZE_OFFSET(genre) },
+
+ { "artist", SIZE_OFFSET(author) },
+ { "creator", SIZE_OFFSET(author) },
+ { "written_by", SIZE_OFFSET(author) },
+ { "lead_performer", SIZE_OFFSET(author) },
+ { "description", SIZE_OFFSET(comment) },
+ { "albumtitle", SIZE_OFFSET(album) },
+ { "date_written", SIZE_OFFSET(year) },
+ { "date_released", SIZE_OFFSET(year) },
+ { "tracknumber", SIZE_OFFSET(track) },
+ { "part_number", SIZE_OFFSET(track) },
+};
+
+void ff_metadata_demux_compat(AVFormatContext *ctx)
+{
+ AVMetadata *m;
+ int i, j;
+
+ if ((m = ctx->metadata))
+ for (j=0; j<m->count; j++)
+ for (i=0; i<FF_ARRAY_ELEMS(compat_tab); i++)
+ if (!strcasecmp(m->elems[j].key, compat_tab[i].name)) {
+ int *ptr = (int *)((char *)ctx+compat_tab[i].offset);
+ if (*ptr) continue;
+ if (compat_tab[i].size > sizeof(int))
+ av_strlcpy((char *)ptr, m->elems[j].value, compat_tab[i].size);
+ else
+ *ptr = atoi(m->elems[j].value);
+ }
+
+ for (i=0; i<ctx->nb_chapters; i++)
+ if ((m = ctx->chapters[i]->metadata))
+ for (j=0; j<m->count; j++)
+ if (!strcasecmp(m->elems[j].key, "title")) {
+ av_free(ctx->chapters[i]->title);
+ ctx->chapters[i]->title = av_strdup(m->elems[j].value);
+ }
+
+ for (i=0; i<ctx->nb_programs; i++)
+ if ((m = ctx->programs[i]->metadata))
+ for (j=0; j<m->count; j++) {
+ if (!strcasecmp(m->elems[j].key, "name")) {
+ av_free(ctx->programs[i]->name);
+ ctx->programs[i]->name = av_strdup(m->elems[j].value);
+ }
+ if (!strcasecmp(m->elems[j].key, "provider_name")) {
+ av_free(ctx->programs[i]->provider_name);
+ ctx->programs[i]->provider_name = av_strdup(m->elems[j].value);
+ }
+ }
+
+ for (i=0; i<ctx->nb_streams; i++)
+ if ((m = ctx->streams[i]->metadata))
+ for (j=0; j<m->count; j++) {
+ if (!strcasecmp(m->elems[j].key, "language"))
+ av_strlcpy(ctx->streams[i]->language, m->elems[j].value, 4);
+ if (!strcasecmp(m->elems[j].key, "filename")) {
+ av_free(ctx->streams[i]->filename);
+ ctx->streams[i]->filename= av_strdup(m->elems[j].value);
+ }
+ }
+}
+
+
+#define FILL_METADATA(s, key, value) { \
+ if (value && *value && \
+ !av_metadata_get(s->metadata, #key, NULL, AV_METADATA_IGNORE_CASE)) \
+ av_metadata_set(&s->metadata, (const AVMetadataTag){#key, value}); \
+ }
+#define FILL_METADATA_STR(s, key) FILL_METADATA(s, key, s->key)
+#define FILL_METADATA_INT(s, key) { \
+ char number[10]; \
+ snprintf(number, sizeof(number), "%d", s->key); \
+ if(s->key) FILL_METADATA(s, key, number) }
+
+void ff_metadata_mux_compat(AVFormatContext *ctx)
+{
+ int i;
+
+ FILL_METADATA_STR(ctx, title);
+ FILL_METADATA_STR(ctx, author);
+ FILL_METADATA_STR(ctx, copyright);
+ FILL_METADATA_STR(ctx, comment);
+ FILL_METADATA_STR(ctx, album);
+ FILL_METADATA_INT(ctx, year);
+ FILL_METADATA_INT(ctx, track);
+ FILL_METADATA_STR(ctx, genre);
+ for (i=0; i<ctx->nb_chapters; i++)
+ FILL_METADATA_STR(ctx->chapters[i], title);
+ for (i=0; i<ctx->nb_programs; i++) {
+ FILL_METADATA_STR(ctx->programs[i], name);
+ FILL_METADATA_STR(ctx->programs[i], provider_name);
+ }
+ for (i=0; i<ctx->nb_streams; i++) {
+ FILL_METADATA_STR(ctx->streams[i], language);
+ FILL_METADATA_STR(ctx->streams[i], filename);
+ }
+}
+
+#endif /* LIBAVFORMAT_VERSION_MAJOR < 53 */
diff --git a/libavformat/mov.c b/libavformat/mov.c
index 32cb909..c0c057a 100644
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@@ -69,18 +69,18 @@ typedef struct {
int first;
int count;
int id;
-} MOV_stsc_t;
+} MOVStsc;
typedef struct {
uint32_t type;
char *path;
-} MOV_dref_t;
+} MOVDref;
typedef struct {
uint32_t type;
int64_t offset;
int64_t size; /* total size (excluding the size and type fields) */
-} MOV_atom_t;
+} MOVAtom;
struct MOVParseTableEntry;
@@ -109,12 +109,12 @@ typedef struct MOVStreamContext {
unsigned int chunk_count;
int64_t *chunk_offsets;
unsigned int stts_count;
- MOV_stts_t *stts_data;
+ MOVStts *stts_data;
unsigned int ctts_count;
- MOV_stts_t *ctts_data;
+ MOVStts *ctts_data;
unsigned int edit_count; /* number of 'edit' (elst atom) */
unsigned int sample_to_chunk_sz;
- MOV_stsc_t *sample_to_chunk;
+ MOVStsc *sample_to_chunk;
int sample_to_ctime_index;
int sample_to_ctime_sample;
unsigned int sample_size;
@@ -131,7 +131,7 @@ typedef struct MOVStreamContext {
int pseudo_stream_id; ///< -1 means demux all ids
int16_t audio_cid; ///< stsd audio compression id
unsigned drefs_count;
- MOV_dref_t *drefs;
+ MOVDref *drefs;
int dref_id;
int wrong_dts; ///< dts are wrong due to negative ctts
} MOVStreamContext;
@@ -149,6 +149,7 @@ typedef struct MOVContext {
MOVFragment fragment; ///< current fragment in moof atom
MOVTrackExt *trex_data;
unsigned trex_count;
+ int itunes_metadata; ///< metadata are itunes style
} MOVContext;
@@ -162,15 +163,15 @@ typedef struct MOVContext {
/* links atom IDs to parse functions */
typedef struct MOVParseTableEntry {
uint32_t type;
- int (*parse)(MOVContext *ctx, ByteIOContext *pb, MOV_atom_t atom);
+ int (*parse)(MOVContext *ctx, ByteIOContext *pb, MOVAtom atom);
} MOVParseTableEntry;
static const MOVParseTableEntry mov_default_parse_table[];
-static int mov_read_default(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_default(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
int64_t total_size = 0;
- MOV_atom_t a;
+ MOVAtom a;
int i;
int err = 0;
@@ -231,7 +232,7 @@ static int mov_read_default(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return err;
}
-static int mov_read_dref(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_dref(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st = c->fc->streams[c->fc->nb_streams-1];
MOVStreamContext *sc = st->priv_data;
@@ -245,7 +246,7 @@ static int mov_read_dref(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
sc->drefs = av_mallocz(entries * sizeof(*sc->drefs));
for (i = 0; i < sc->drefs_count; i++) {
- MOV_dref_t *dref = &sc->drefs[i];
+ MOVDref *dref = &sc->drefs[i];
uint32_t size = get_be32(pb);
int64_t next = url_ftell(pb) + size - 4;
@@ -299,7 +300,7 @@ static int mov_read_dref(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
-static int mov_read_hdlr(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_hdlr(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st = c->fc->streams[c->fc->nb_streams-1];
uint32_t type;
@@ -372,7 +373,7 @@ static const AVCodecTag mp4_audio_types[] = {
{ CODEC_ID_NONE, 0 },
};
-static int mov_read_esds(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_esds(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st = c->fc->streams[c->fc->nb_streams-1];
int tag, len;
@@ -429,8 +430,24 @@ static int mov_read_esds(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
+static int mov_read_pasp(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
+{
+ const int num = get_be32(pb);
+ const int den = get_be32(pb);
+ AVStream * const st = c->fc->streams[c->fc->nb_streams-1];
+ if (den != 0) {
+ if ((st->sample_aspect_ratio.den && den != st->sample_aspect_ratio.den) ||
+ (st->sample_aspect_ratio.num && num != st->sample_aspect_ratio.num))
+ av_log(c->fc, AV_LOG_WARNING,
+ "sample aspect ratio already set, overriding by 'pasp' atom\n");
+ st->sample_aspect_ratio.num = num;
+ st->sample_aspect_ratio.den = den;
+ }
+ return 0;
+}
+
/* this atom contains actual media data */
-static int mov_read_mdat(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_mdat(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
if(atom.size == 0) /* wrong one (MP4) */
return 0;
@@ -438,7 +455,7 @@ static int mov_read_mdat(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0; /* now go for moov */
}
-static int mov_read_ftyp(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_ftyp(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
uint32_t type = get_le32(pb);
@@ -451,7 +468,7 @@ static int mov_read_ftyp(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
}
/* this atom should contain all header atoms */
-static int mov_read_moov(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_moov(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
if (mov_read_default(c, pb, atom) < 0)
return -1;
@@ -461,14 +478,14 @@ static int mov_read_moov(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0; /* now go for mdat */
}
-static int mov_read_moof(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_moof(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
c->fragment.moof_offset = url_ftell(pb) - 8;
dprintf(c->fc, "moof offset %llx\n", c->fragment.moof_offset);
return mov_read_default(c, pb, atom);
}
-static int mov_read_mdhd(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_mdhd(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st = c->fc->streams[c->fc->nb_streams-1];
MOVStreamContext *sc = st->priv_data;
@@ -497,7 +514,7 @@ static int mov_read_mdhd(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
-static int mov_read_mvhd(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_mvhd(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
int version = get_byte(pb); /* version */
get_be24(pb); /* flags */
@@ -533,7 +550,7 @@ static int mov_read_mvhd(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
-static int mov_read_smi(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_smi(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st = c->fc->streams[c->fc->nb_streams-1];
@@ -553,12 +570,13 @@ static int mov_read_smi(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
-static int mov_read_enda(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_enda(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st = c->fc->streams[c->fc->nb_streams-1];
int little_endian = get_be16(pb);
- if (little_endian) {
+ dprintf(c->fc, "enda %d\n", little_endian);
+ if (little_endian == 1) {
switch (st->codec->codec_id) {
case CODEC_ID_PCM_S24BE:
st->codec->codec_id = CODEC_ID_PCM_S24LE;
@@ -580,11 +598,16 @@ static int mov_read_enda(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
}
/* FIXME modify qdm2/svq3/h264 decoders to take full atom as extradata */
-static int mov_read_extradata(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_extradata(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
- AVStream *st = c->fc->streams[c->fc->nb_streams-1];
- uint64_t size= (uint64_t)st->codec->extradata_size + atom.size + 8 + FF_INPUT_BUFFER_PADDING_SIZE;
+ AVStream *st;
+ uint64_t size;
uint8_t *buf;
+
+ if (c->fc->nb_streams < 1) // will happen with jp2 files
+ return 0;
+ st= c->fc->streams[c->fc->nb_streams-1];
+ size= (uint64_t)st->codec->extradata_size + atom.size + 8 + FF_INPUT_BUFFER_PADDING_SIZE;
if(size > INT_MAX || (uint64_t)atom.size > INT_MAX)
return -1;
buf= av_realloc(st->codec->extradata, size);
@@ -599,7 +622,7 @@ static int mov_read_extradata(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
-static int mov_read_wave(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_wave(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st = c->fc->streams[c->fc->nb_streams-1];
@@ -626,7 +649,7 @@ static int mov_read_wave(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
* This function reads atom content and puts data in extradata without tag
* nor size unlike mov_read_extradata.
*/
-static int mov_read_glbl(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_glbl(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st = c->fc->streams[c->fc->nb_streams-1];
@@ -642,7 +665,7 @@ static int mov_read_glbl(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
-static int mov_read_stco(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_stco(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st = c->fc->streams[c->fc->nb_streams-1];
MOVStreamContext *sc = st->priv_data;
@@ -707,7 +730,7 @@ static enum CodecID mov_get_lpcm_codec_id(int bps, int flags)
return CODEC_ID_NONE;
}
-static int mov_read_stsd(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_stsd(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st = c->fc->streams[c->fc->nb_streams-1];
MOVStreamContext *sc = st->priv_data;
@@ -722,7 +745,7 @@ static int mov_read_stsd(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
//Parsing Sample description table
enum CodecID id;
int dref_id;
- MOV_atom_t a = { 0, 0, 0 };
+ MOVAtom a = { 0, 0, 0 };
int64_t start_pos = url_ftell(pb);
int size = get_be32(pb); /* size */
uint32_t format = get_le32(pb); /* data format */
@@ -987,6 +1010,9 @@ static int mov_read_stsd(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
#endif
/* no ifdef since parameters are always those */
case CODEC_ID_QCELP:
+ st->codec->frame_size= 160;
+ st->codec->channels= 1; /* really needed */
+ break;
case CODEC_ID_AMR_NB:
case CODEC_ID_AMR_WB:
st->codec->frame_size= sc->samples_per_frame;
@@ -1008,8 +1034,10 @@ static int mov_read_stsd(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
st->codec->block_align = sc->bytes_per_frame;
break;
case CODEC_ID_ALAC:
- if (st->codec->extradata_size == 36)
- st->codec->frame_size = AV_RB32((st->codec->extradata+12));
+ if (st->codec->extradata_size == 36) {
+ st->codec->frame_size = AV_RB32(st->codec->extradata+12);
+ st->codec->channels = AV_RB8 (st->codec->extradata+21);
+ }
break;
default:
break;
@@ -1018,7 +1046,7 @@ static int mov_read_stsd(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
-static int mov_read_stsc(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_stsc(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st = c->fc->streams[c->fc->nb_streams-1];
MOVStreamContext *sc = st->priv_data;
@@ -1029,13 +1057,13 @@ static int mov_read_stsc(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
entries = get_be32(pb);
- if(entries >= UINT_MAX / sizeof(MOV_stsc_t))
+ if(entries >= UINT_MAX / sizeof(*sc->sample_to_chunk))
return -1;
dprintf(c->fc, "track[%i].stsc.entries = %i\n", c->fc->nb_streams-1, entries);
sc->sample_to_chunk_sz = entries;
- sc->sample_to_chunk = av_malloc(entries * sizeof(MOV_stsc_t));
+ sc->sample_to_chunk = av_malloc(entries * sizeof(*sc->sample_to_chunk));
if (!sc->sample_to_chunk)
return -1;
for(i=0; i<entries; i++) {
@@ -1046,7 +1074,7 @@ static int mov_read_stsc(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
-static int mov_read_stss(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_stss(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st = c->fc->streams[c->fc->nb_streams-1];
MOVStreamContext *sc = st->priv_data;
@@ -1074,7 +1102,7 @@ static int mov_read_stss(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
-static int mov_read_stsz(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_stsz(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st = c->fc->streams[c->fc->nb_streams-1];
MOVStreamContext *sc = st->priv_data;
@@ -1104,7 +1132,7 @@ static int mov_read_stsz(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
-static int mov_read_stts(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_stts(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st = c->fc->streams[c->fc->nb_streams-1];
MOVStreamContext *sc = st->priv_data;
@@ -1115,11 +1143,11 @@ static int mov_read_stts(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
get_byte(pb); /* version */
get_be24(pb); /* flags */
entries = get_be32(pb);
- if(entries >= UINT_MAX / sizeof(MOV_stts_t))
+ if(entries >= UINT_MAX / sizeof(*sc->stts_data))
return -1;
sc->stts_count = entries;
- sc->stts_data = av_malloc(entries * sizeof(MOV_stts_t));
+ sc->stts_data = av_malloc(entries * sizeof(*sc->stts_data));
if (!sc->stts_data)
return -1;
dprintf(c->fc, "track[%i].stts.entries = %i\n", c->fc->nb_streams-1, entries);
@@ -1149,7 +1177,7 @@ static int mov_read_stts(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
-static int mov_read_ctts(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_ctts(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st = c->fc->streams[c->fc->nb_streams-1];
MOVStreamContext *sc = st->priv_data;
@@ -1158,11 +1186,11 @@ static int mov_read_ctts(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
get_byte(pb); /* version */
get_be24(pb); /* flags */
entries = get_be32(pb);
- if(entries >= UINT_MAX / sizeof(MOV_stts_t))
+ if(entries >= UINT_MAX / sizeof(*sc->ctts_data))
return -1;
sc->ctts_count = entries;
- sc->ctts_data = av_malloc(entries * sizeof(MOV_stts_t));
+ sc->ctts_data = av_malloc(entries * sizeof(*sc->ctts_data));
if (!sc->ctts_data)
return -1;
dprintf(c->fc, "track[%i].ctts.entries = %i\n", c->fc->nb_streams-1, entries);
@@ -1291,7 +1319,7 @@ static void mov_build_index(MOVContext *mov, AVStream *st)
sc->sample_count = st->nb_index_entries;
}
-static int mov_read_trak(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_trak(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
AVStream *st;
MOVStreamContext *sc;
@@ -1366,50 +1394,71 @@ static int mov_read_trak(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
-static void mov_parse_udta_string(ByteIOContext *pb, char *str, int size)
+static int mov_read_ilst(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
- uint16_t str_size = get_be16(pb); /* string length */;
-
- get_be16(pb); /* skip language */
- get_buffer(pb, str, FFMIN(size, str_size));
+ int ret;
+ c->itunes_metadata = 1;
+ ret = mov_read_default(c, pb, atom);
+ c->itunes_metadata = 0;
+ return ret;
}
-static int mov_read_udta(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_meta(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
- uint64_t end = url_ftell(pb) + atom.size;
-
- while (url_ftell(pb) + 8 < end) {
- uint32_t tag_size = get_be32(pb);
- uint32_t tag = get_le32(pb);
- uint64_t next = url_ftell(pb) + tag_size - 8;
-
- if (tag_size < 8 || next > end) // stop if tag_size is wrong
- break;
+ url_fskip(pb, 4); // version + flags
+ atom.size -= 4;
+ return mov_read_default(c, pb, atom);
+}
- switch (tag) {
- case MKTAG(0xa9,'n','a','m'):
- mov_parse_udta_string(pb, c->fc->title, sizeof(c->fc->title));
- break;
- case MKTAG(0xa9,'w','r','t'):
- mov_parse_udta_string(pb, c->fc->author, sizeof(c->fc->author));
- break;
- case MKTAG(0xa9,'c','p','y'):
- mov_parse_udta_string(pb, c->fc->copyright, sizeof(c->fc->copyright));
- break;
- case MKTAG(0xa9,'i','n','f'):
- mov_parse_udta_string(pb, c->fc->comment, sizeof(c->fc->comment));
- break;
- default:
- break;
- }
+static int mov_read_trkn(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
+{
+ get_be32(pb); // type
+ get_be32(pb); // unknown
+ c->fc->track = get_be32(pb);
+ dprintf(c->fc, "%.4s %d\n", (char*)&atom.type, c->fc->track);
+ return 0;
+}
- url_fseek(pb, next, SEEK_SET);
+static int mov_read_udta_string(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
+{
+ char *str = NULL;
+ int size;
+ uint16_t str_size;
+
+ if (c->itunes_metadata) {
+ int data_size = get_be32(pb);
+ int tag = get_le32(pb);
+ if (tag == MKTAG('d','a','t','a')) {
+ get_be32(pb); // type
+ get_be32(pb); // unknown
+ str_size = data_size - 16;
+ } else return 0;
+ } else {
+ str_size = get_be16(pb); // string length
+ get_be16(pb); // language
}
-
+ switch (atom.type) {
+ case MKTAG(0xa9,'n','a','m'):
+ str = c->fc->title; size = sizeof(c->fc->title); break;
+ case MKTAG(0xa9,'A','R','T'):
+ case MKTAG(0xa9,'w','r','t'):
+ str = c->fc->author; size = sizeof(c->fc->author); break;
+ case MKTAG(0xa9,'c','p','y'):
+ str = c->fc->copyright; size = sizeof(c->fc->copyright); break;
+ case MKTAG(0xa9,'c','m','t'):
+ case MKTAG(0xa9,'i','n','f'):
+ str = c->fc->comment; size = sizeof(c->fc->comment); break;
+ case MKTAG(0xa9,'a','l','b'):
+ str = c->fc->album; size = sizeof(c->fc->album); break;
+ }
+ if (!str)
+ return 0;
+ get_buffer(pb, str, FFMIN(size, str_size));
+ dprintf(c->fc, "%.4s %s\n", (char*)&atom.type, str);
return 0;
}
-static int mov_read_tkhd(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_tkhd(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
int i;
int width;
@@ -1480,7 +1529,7 @@ static int mov_read_tkhd(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
-static int mov_read_tfhd(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_tfhd(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
MOVFragment *frag = &c->fragment;
MOVTrackExt *trex = NULL;
@@ -1515,7 +1564,7 @@ static int mov_read_tfhd(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
-static int mov_read_trex(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_trex(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
MOVTrackExt *trex;
@@ -1535,7 +1584,7 @@ static int mov_read_trex(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return 0;
}
-static int mov_read_trun(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_trun(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
MOVFragment *frag = &c->fragment;
AVStream *st;
@@ -1606,7 +1655,7 @@ static int mov_read_trun(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
/* this atom should be null (from specs), but some buggy files put the 'moov' atom inside it... */
/* like the files created with Adobe Premiere 5.0, for samples see */
/* http://graphics.tudelft.nl/~wouter/publications/soundtests/ */
-static int mov_read_wide(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_wide(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
int err;
@@ -1627,7 +1676,7 @@ static int mov_read_wide(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
return err;
}
-static int mov_read_cmov(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_cmov(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
#ifdef CONFIG_ZLIB
ByteIOContext ctx;
@@ -1680,7 +1729,7 @@ free_and_return:
}
/* edit list atom */
-static int mov_read_elst(MOVContext *c, ByteIOContext *pb, MOV_atom_t atom)
+static int mov_read_elst(MOVContext *c, ByteIOContext *pb, MOVAtom atom)
{
MOVStreamContext *sc = c->fc->streams[c->fc->nb_streams-1]->priv_data;
int i, edit_count;
@@ -1715,10 +1764,12 @@ static const MOVParseTableEntry mov_default_parse_table[] = {
{ MKTAG('f','t','y','p'), mov_read_ftyp },
{ MKTAG('g','l','b','l'), mov_read_glbl },
{ MKTAG('h','d','l','r'), mov_read_hdlr },
+{ MKTAG('i','l','s','t'), mov_read_ilst },
{ MKTAG('j','p','2','h'), mov_read_extradata },
{ MKTAG('m','d','a','t'), mov_read_mdat },
{ MKTAG('m','d','h','d'), mov_read_mdhd },
{ MKTAG('m','d','i','a'), mov_read_default },
+{ MKTAG('m','e','t','a'), mov_read_meta },
{ MKTAG('m','i','n','f'), mov_read_default },
{ MKTAG('m','o','o','f'), mov_read_moof },
{ MKTAG('m','o','o','v'), mov_read_moov },
@@ -1727,6 +1778,7 @@ static const MOVParseTableEntry mov_default_parse_table[] = {
{ MKTAG('S','M','I',' '), mov_read_smi }, /* Sorenson extension ??? */
{ MKTAG('a','l','a','c'), mov_read_extradata }, /* alac specific atom */
{ MKTAG('a','v','c','C'), mov_read_glbl },
+{ MKTAG('p','a','s','p'), mov_read_pasp },
{ MKTAG('s','t','b','l'), mov_read_default },
{ MKTAG('s','t','c','o'), mov_read_stco },
{ MKTAG('s','t','s','c'), mov_read_stsc },
@@ -1739,12 +1791,21 @@ static const MOVParseTableEntry mov_default_parse_table[] = {
{ MKTAG('t','r','a','k'), mov_read_trak },
{ MKTAG('t','r','a','f'), mov_read_default },
{ MKTAG('t','r','e','x'), mov_read_trex },
+{ MKTAG('t','r','k','n'), mov_read_trkn },
{ MKTAG('t','r','u','n'), mov_read_trun },
-{ MKTAG('u','d','t','a'), mov_read_udta },
+{ MKTAG('u','d','t','a'), mov_read_default },
{ MKTAG('w','a','v','e'), mov_read_wave },
{ MKTAG('e','s','d','s'), mov_read_esds },
{ MKTAG('w','i','d','e'), mov_read_wide }, /* place holder */
{ MKTAG('c','m','o','v'), mov_read_cmov },
+{ MKTAG(0xa9,'n','a','m'), mov_read_udta_string },
+{ MKTAG(0xa9,'w','r','t'), mov_read_udta_string },
+{ MKTAG(0xa9,'c','p','y'), mov_read_udta_string },
+{ MKTAG(0xa9,'i','n','f'), mov_read_udta_string },
+{ MKTAG(0xa9,'i','n','f'), mov_read_udta_string },
+{ MKTAG(0xa9,'A','R','T'), mov_read_udta_string },
+{ MKTAG(0xa9,'a','l','b'), mov_read_udta_string },
+{ MKTAG(0xa9,'c','m','t'), mov_read_udta_string },
{ 0, NULL }
};
@@ -1798,7 +1859,7 @@ static int mov_read_header(AVFormatContext *s, AVFormatParameters *ap)
MOVContext *mov = s->priv_data;
ByteIOContext *pb = s->pb;
int err;
- MOV_atom_t atom = { 0, 0, 0 };
+ MOVAtom atom = { 0, 0, 0 };
mov->fc = s;
/* .mov and .mp4 aren't streamable anyway (only progressive download if moov is before mdat) */
@@ -1851,7 +1912,7 @@ static int mov_read_packet(AVFormatContext *s, AVPacket *pkt)
if (!sample) {
mov->found_mdat = 0;
if (!url_is_streamed(s->pb) ||
- mov_read_default(mov, s->pb, (MOV_atom_t){ 0, 0, INT64_MAX }) < 0 ||
+ mov_read_default(mov, s->pb, (MOVAtom){ 0, 0, INT64_MAX }) < 0 ||
url_feof(s->pb))
return -1;
dprintf(s, "read fragments, offset 0x%llx\n", url_ftell(s->pb));
diff --git a/libavformat/movenc.c b/libavformat/movenc.c
index 9588bb6..6ddbed1 100644
--- a/libavformat/movenc.c
+++ b/libavformat/movenc.c
@@ -723,7 +723,7 @@ static int mov_write_stsd_tag(ByteIOContext *pb, MOVTrack *track)
static int mov_write_ctts_tag(ByteIOContext *pb, MOVTrack *track)
{
- MOV_stts_t *ctts_entries;
+ MOVStts *ctts_entries;
uint32_t entries = 0;
uint32_t atom_size;
int i;
@@ -757,7 +757,7 @@ static int mov_write_ctts_tag(ByteIOContext *pb, MOVTrack *track)
/* Time to sample atom */
static int mov_write_stts_tag(ByteIOContext *pb, MOVTrack *track)
{
- MOV_stts_t *stts_entries;
+ MOVStts *stts_entries;
uint32_t entries = -1;
uint32_t atom_size;
int i;
diff --git a/libavformat/mp3.c b/libavformat/mp3.c
index 407d4f0..a4eb49c 100644
--- a/libavformat/mp3.c
+++ b/libavformat/mp3.c
@@ -383,7 +383,7 @@ static int mp3_read_probe(AVProbeData *p)
for(frames = 0; buf2 < end; frames++) {
header = AV_RB32(buf2);
- fsize = ff_mpa_decode_header(&avctx, header, &sample_rate);
+ fsize = ff_mpa_decode_header(&avctx, header, &sample_rate, &sample_rate, &sample_rate, &sample_rate);
if(fsize < 0)
break;
buf2 += fsize;
@@ -402,20 +402,22 @@ static int mp3_read_probe(AVProbeData *p)
/**
* Try to find Xing/Info/VBRI tags and compute duration from info therein
*/
-static void mp3_parse_vbr_tags(AVFormatContext *s, AVStream *st, int64_t base)
+static int mp3_parse_vbr_tags(AVFormatContext *s, AVStream *st, int64_t base)
{
uint32_t v, spf;
int frames = -1; /* Total number of frames in file */
const int64_t xing_offtbl[2][2] = {{32, 17}, {17,9}};
MPADecodeContext c;
+ int vbrtag_size = 0;
v = get_be32(s->pb);
if(ff_mpa_check_header(v) < 0)
- return;
+ return -1;
- ff_mpegaudio_decode_header(&c, v);
+ if (ff_mpegaudio_decode_header(&c, v) == 0)
+ vbrtag_size = c.frame_size;
if(c.layer != 3)
- return;
+ return -1;
/* Check for Xing / Info tag */
url_fseek(s->pb, xing_offtbl[c.lsf == 1][c.nb_channels == 1], SEEK_CUR);
@@ -439,11 +441,15 @@ static void mp3_parse_vbr_tags(AVFormatContext *s, AVStream *st, int64_t base)
}
if(frames < 0)
- return;
+ return -1;
+
+ /* Skip the vbr tag frame */
+ url_fseek(s->pb, base + vbrtag_size, SEEK_SET);
spf = c.lsf ? 576 : 1152; /* Samples per frame, layer 3 */
st->duration = av_rescale_q(frames, (AVRational){spf, c.sample_rate},
st->time_base);
+ return 0;
}
static int mp3_read_header(AVFormatContext *s,
@@ -493,8 +499,8 @@ static int mp3_read_header(AVFormatContext *s,
}
off = url_ftell(s->pb);
- mp3_parse_vbr_tags(s, st, off);
- url_fseek(s->pb, off, SEEK_SET);
+ if (mp3_parse_vbr_tags(s, st, off) < 0)
+ url_fseek(s->pb, off, SEEK_SET);
/* the parameters will be extracted from the compressed bitstream */
return 0;
diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
index 48aca43..00ce336 100644
--- a/libavformat/mpegts.c
+++ b/libavformat/mpegts.c
@@ -22,6 +22,7 @@
#include "libavutil/crc.h"
#include "avformat.h"
#include "mpegts.h"
+#include "internal.h"
//#define DEBUG_SI
//#define DEBUG_SEEK
@@ -38,8 +39,6 @@ typedef struct PESContext PESContext;
static PESContext* add_pes_stream(MpegTSContext *ts, int pid, int pcr_pid, int stream_type);
static AVStream* new_pes_av_stream(PESContext *pes, uint32_t code);
-extern void av_set_program_name(AVProgram *program, char *provider_name, char *name);
-extern void av_program_add_stream_index(AVFormatContext *ac, int progid, unsigned int idx);
enum MpegTSFilterType {
MPEGTS_PES,
@@ -80,11 +79,11 @@ struct MpegTSFilter {
};
#define MAX_PIDS_PER_PROGRAM 64
-typedef struct {
+struct Program {
unsigned int id; //program id/service id
unsigned int nb_pids;
unsigned int pids[MAX_PIDS_PER_PROGRAM];
-} Program_t;
+};
struct MpegTSContext {
/* user data */
@@ -114,7 +113,7 @@ struct MpegTSContext {
/* scan context */
/** structure to keep track of Program->pids mapping */
unsigned int nb_prg;
- Program_t *prg;
+ struct Program *prg;
/** filters for various streams specified by PMT + for the PAT and PMT */
@@ -169,8 +168,8 @@ static void clear_programs(MpegTSContext *ts)
static void add_pat_entry(MpegTSContext *ts, unsigned int programid)
{
- Program_t *p;
- void *tmp = av_realloc(ts->prg, (ts->nb_prg+1)*sizeof(Program_t));
+ struct Program *p;
+ void *tmp = av_realloc(ts->prg, (ts->nb_prg+1)*sizeof(struct Program));
if(!tmp)
return;
ts->prg = tmp;
@@ -183,7 +182,7 @@ static void add_pat_entry(MpegTSContext *ts, unsigned int programid)
static void add_pid_to_pmt(MpegTSContext *ts, unsigned int programid, unsigned int pid)
{
int i;
- Program_t *p = NULL;
+ struct Program *p = NULL;
for(i=0; i<ts->nb_prg; i++) {
if(ts->prg[i].id == programid) {
p = &ts->prg[i];
@@ -210,7 +209,7 @@ static int discard_pid(MpegTSContext *ts, unsigned int pid)
{
int i, j, k;
int used = 0, discarded = 0;
- Program_t *p;
+ struct Program *p;
for(i=0; i<ts->nb_prg; i++) {
p = &ts->prg[i];
for(j=0; j<p->nb_pids; j++) {
diff --git a/libavformat/mtv.c b/libavformat/mtv.c
index 75da0e7..7bf0d84 100644
--- a/libavformat/mtv.c
+++ b/libavformat/mtv.c
@@ -36,24 +36,23 @@
typedef struct MTVDemuxContext {
- unsigned int file_size; ///< filesize, not always right
- unsigned int segments; ///< number of 512 byte segments
- unsigned int audio_identifier; ///< 'MP3' on all files I have seen
- unsigned int audio_br; ///< bitrate of audio chanel (mp3)
- unsigned int img_colorfmt; ///< frame colorfmt rgb 565/555
- unsigned int img_bpp; ///< frame bits per pixel
- unsigned int img_width; //
- unsigned int img_height; //
- unsigned int img_segment_size; ///< size of image segment
- unsigned int video_fps; //
- unsigned int full_segment_size;
+ unsigned int file_size; ///< filesize, not always right
+ unsigned int segments; ///< number of 512 byte segments
+ unsigned int audio_identifier; ///< 'MP3' on all files I have seen
+ unsigned int audio_br; ///< bitrate of audio chanel (mp3)
+ unsigned int img_colorfmt; ///< frame colorfmt rgb 565/555
+ unsigned int img_bpp; ///< frame bits per pixel
+ unsigned int img_width; //
+ unsigned int img_height; //
+ unsigned int img_segment_size; ///< size of image segment
+ unsigned int video_fps; //
+ unsigned int full_segment_size;
} MTVDemuxContext;
static int mtv_probe(AVProbeData *p)
{
/* Magic is 'AMV' */
-
if(*(p->buf) != 'A' || *(p->buf+1) != 'M' || *(p->buf+2) != 'V')
return 0;
@@ -62,11 +61,10 @@ static int mtv_probe(AVProbeData *p)
static int mtv_read_header(AVFormatContext *s, AVFormatParameters *ap)
{
- MTVDemuxContext *mtv = s->priv_data;
- ByteIOContext *pb = s->pb;
- AVStream *st;
- unsigned int audio_subsegments;
-
+ MTVDemuxContext *mtv = s->priv_data;
+ ByteIOContext *pb = s->pb;
+ AVStream *st;
+ unsigned int audio_subsegments;
url_fskip(pb, 3);
mtv->file_size = get_le32(pb);
@@ -86,11 +84,11 @@ static int mtv_read_header(AVFormatContext *s, AVFormatParameters *ap)
mtv->img_segment_size;
mtv->video_fps = (mtv->audio_br / 4) / audio_subsegments;
- /* FIXME Add sanity check here */
+ // FIXME Add sanity check here
- /* all systems go! init decoders */
+ // all systems go! init decoders
- /* video - raw rgb565 */
+ // video - raw rgb565
st = av_new_stream(s, VIDEO_SID);
if(!st)
@@ -105,7 +103,7 @@ static int mtv_read_header(AVFormatContext *s, AVFormatParameters *ap)
st->codec->bits_per_coded_sample = mtv->img_bpp;
st->codec->sample_rate = mtv->video_fps;
- /* audio - mp3 */
+ // audio - mp3
st = av_new_stream(s, AUDIO_SID);
if(!st)
@@ -117,7 +115,7 @@ static int mtv_read_header(AVFormatContext *s, AVFormatParameters *ap)
st->codec->bit_rate = mtv->audio_br;
st->need_parsing = AVSTREAM_PARSE_FULL;
- /* Jump over header */
+ // Jump over header
if(url_fseek(pb, MTV_HEADER_SIZE, SEEK_SET) != MTV_HEADER_SIZE)
return AVERROR(EIO);
diff --git a/libavformat/mxf.h b/libavformat/mxf.h
index 4c17a9e..39c8c16 100644
--- a/libavformat/mxf.h
+++ b/libavformat/mxf.h
@@ -41,6 +41,7 @@ enum MXFMetadataSetType {
Identification,
ContentStorage,
SubDescriptor,
+ IndexTableSegment,
TypeBottom,// add metadata type before this
};
diff --git a/libavformat/mxfdec.c b/libavformat/mxfdec.c
index 98cf41d..6dafdc6 100644
--- a/libavformat/mxfdec.c
+++ b/libavformat/mxfdec.c
@@ -104,6 +104,11 @@ typedef struct {
typedef struct {
UID uid;
enum MXFMetadataSetType type;
+} MXFIndexTableSegment;
+
+typedef struct {
+ UID uid;
+ enum MXFMetadataSetType type;
UID package_uid;
UID *tracks_refs;
int tracks_count;
@@ -300,6 +305,7 @@ static int mxf_read_packet(AVFormatContext *s, AVPacket *pkt)
if (klv_read_packet(&klv, s->pb) < 0)
return -1;
PRINT_KEY(s, "read packet", klv.key);
+ dprintf(s, "size %lld offset %#llx\n", klv.length, klv.offset);
if (IS_KLV_KEY(klv.key, mxf_encrypted_triplet_key)) {
int res = mxf_decrypt_triplet(s, pkt, &klv);
if (res < 0) {
@@ -496,13 +502,26 @@ static int mxf_read_source_package(MXFPackage *package, ByteIOContext *pb, int t
return 0;
}
+static int mxf_read_index_table_segment(MXFIndexTableSegment *segment, ByteIOContext *pb, int tag)
+{
+ switch(tag) {
+ case 0x3F05: dprintf(NULL, "EditUnitByteCount %d\n", get_be32(pb)); break;
+ case 0x3F06: dprintf(NULL, "IndexSID %d\n", get_be32(pb)); break;
+ case 0x3F07: dprintf(NULL, "BodySID %d\n", get_be32(pb)); break;
+ case 0x3F0B: dprintf(NULL, "IndexEditRate %d/%d\n", get_be32(pb), get_be32(pb)); break;
+ case 0x3F0C: dprintf(NULL, "IndexStartPosition %lld\n", get_be64(pb)); break;
+ case 0x3F0D: dprintf(NULL, "IndexDuration %lld\n", get_be64(pb)); break;
+ }
+ return 0;
+}
+
static void mxf_read_pixel_layout(ByteIOContext *pb, MXFDescriptor *descriptor)
{
int code;
do {
code = get_byte(pb);
- dprintf(NULL, "pixel layout: code 0x%x\n", code);
+ dprintf(NULL, "pixel layout: code %#x\n", code);
switch (code) {
case 0x52: /* R */
descriptor->bits_per_sample += get_byte(pb);
@@ -837,6 +856,7 @@ static const MXFMetadataReadTableEntry mxf_metadata_read_table[] = {
{ { 0x06,0x0E,0x2B,0x34,0x02,0x53,0x01,0x01,0x0d,0x01,0x01,0x01,0x01,0x01,0x3A,0x00 }, mxf_read_track, sizeof(MXFTrack), Track }, /* Static Track */
{ { 0x06,0x0E,0x2B,0x34,0x02,0x53,0x01,0x01,0x0d,0x01,0x01,0x01,0x01,0x01,0x3B,0x00 }, mxf_read_track, sizeof(MXFTrack), Track }, /* Generic Track */
{ { 0x06,0x0E,0x2B,0x34,0x02,0x53,0x01,0x01,0x0d,0x01,0x04,0x01,0x02,0x02,0x00,0x00 }, mxf_read_cryptographic_context, sizeof(MXFCryptoContext), CryptoContext },
+ { { 0x06,0x0E,0x2B,0x34,0x02,0x53,0x01,0x01,0x0d,0x01,0x02,0x01,0x01,0x10,0x01,0x00 }, mxf_read_index_table_segment, sizeof(MXFIndexTableSegment), IndexTableSegment },
{ { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 }, NULL, 0, AnyType },
};
@@ -854,8 +874,9 @@ static int mxf_read_local_tags(MXFContext *mxf, KLVPacket *klv, int (*read_child
uint64_t next = url_ftell(pb) + size;
UID uid = {0};
+ dprintf(mxf->fc, "local tag %#04x size %d\n", tag, size);
if (!size) { /* ignore empty tag, needed for some files with empty UMID tag */
- av_log(mxf->fc, AV_LOG_ERROR, "local tag 0x%04X with 0 size\n", tag);
+ av_log(mxf->fc, AV_LOG_ERROR, "local tag %#04x with 0 size\n", tag);
continue;
}
if (tag > 0x7FFF) { /* dynamic tag */
@@ -864,7 +885,7 @@ static int mxf_read_local_tags(MXFContext *mxf, KLVPacket *klv, int (*read_child
int local_tag = AV_RB16(mxf->local_tags+i*18);
if (local_tag == tag) {
memcpy(uid, mxf->local_tags+i*18+2, 16);
- dprintf(mxf->fc, "local tag 0x%04X\n", local_tag);
+ dprintf(mxf->fc, "local tag %#04x\n", local_tag);
PRINT_KEY(mxf->fc, "uid", uid);
}
}
@@ -897,6 +918,7 @@ static int mxf_read_header(AVFormatContext *s, AVFormatParameters *ap)
if (klv_read_packet(&klv, s->pb) < 0)
return -1;
PRINT_KEY(s, "read header", klv.key);
+ dprintf(s, "size %lld offset %#llx\n", klv.length, klv.offset);
if (IS_KLV_KEY(klv.key, mxf_encrypted_triplet_key) ||
IS_KLV_KEY(klv.key, mxf_essence_element_key)) {
/* FIXME avoid seek */
diff --git a/libavformat/nut.c b/libavformat/nut.c
index 7a978a5..6fdc298 100644
--- a/libavformat/nut.c
+++ b/libavformat/nut.c
@@ -47,16 +47,16 @@ int64_t ff_lsb2full(StreamContext *stream, int64_t lsb){
return ((lsb - delta)&mask) + delta;
}
-int ff_nut_sp_pos_cmp(syncpoint_t *a, syncpoint_t *b){
+int ff_nut_sp_pos_cmp(Syncpoint *a, Syncpoint *b){
return ((a->pos - b->pos) >> 32) - ((b->pos - a->pos) >> 32);
}
-int ff_nut_sp_pts_cmp(syncpoint_t *a, syncpoint_t *b){
+int ff_nut_sp_pts_cmp(Syncpoint *a, Syncpoint *b){
return ((a->ts - b->ts) >> 32) - ((b->ts - a->ts) >> 32);
}
void ff_nut_add_sp(NUTContext *nut, int64_t pos, int64_t back_ptr, int64_t ts){
- syncpoint_t *sp= av_mallocz(sizeof(syncpoint_t));
+ Syncpoint *sp= av_mallocz(sizeof(Syncpoint));
struct AVTreeNode *node= av_mallocz(av_tree_node_size);
sp->pos= pos;
diff --git a/libavformat/nut.h b/libavformat/nut.h
index 713d27d..a1081ed 100644
--- a/libavformat/nut.h
+++ b/libavformat/nut.h
@@ -50,14 +50,14 @@ typedef enum{
FLAG_MATCH_TIME =2048, ///<If set, match_time_delta is coded in the frame header
FLAG_CODED =4096, ///<if set, coded_flags are stored in the frame header
FLAG_INVALID =8192, ///<if set, frame_code is invalid
-}flag_t;
+} Flag;
typedef struct {
uint64_t pos;
uint64_t back_ptr;
// uint64_t global_key_pts;
int64_t ts;
-} syncpoint_t;
+} Syncpoint;
typedef struct {
uint16_t flags;
@@ -67,7 +67,7 @@ typedef struct {
int16_t pts_delta;
uint8_t reserved_count;
uint8_t header_idx;
-} FrameCode; // maybe s/FrameCode/framecode_t/ or change all to Java style but do not mix
+} FrameCode;
typedef struct {
int last_flags;
@@ -78,7 +78,7 @@ typedef struct {
int msb_pts_shift;
int max_pts_distance;
int decode_delay; //FIXME duplicate of has_b_frames
-} StreamContext;// maybe s/StreamContext/streamcontext_t/
+} StreamContext;
typedef struct {
AVFormatContext *avf;
@@ -106,8 +106,8 @@ typedef struct {
void ff_nut_reset_ts(NUTContext *nut, AVRational time_base, int64_t val);
int64_t ff_lsb2full(StreamContext *stream, int64_t lsb);
-int ff_nut_sp_pos_cmp(syncpoint_t *a, syncpoint_t *b);
-int ff_nut_sp_pts_cmp(syncpoint_t *a, syncpoint_t *b);
+int ff_nut_sp_pos_cmp(Syncpoint *a, Syncpoint *b);
+int ff_nut_sp_pts_cmp(Syncpoint *a, Syncpoint *b);
void ff_nut_add_sp(NUTContext *nut, int64_t pos, int64_t back_ptr, int64_t ts);
extern const Dispositions ff_nut_dispositions[];
diff --git a/libavformat/nutdec.c b/libavformat/nutdec.c
index cf41f62..098bc84 100644
--- a/libavformat/nutdec.c
+++ b/libavformat/nutdec.c
@@ -846,9 +846,9 @@ assert(0);
static int read_seek(AVFormatContext *s, int stream_index, int64_t pts, int flags){
NUTContext *nut = s->priv_data;
AVStream *st= s->streams[stream_index];
- syncpoint_t dummy={.ts= pts*av_q2d(st->time_base)*AV_TIME_BASE};
- syncpoint_t nopts_sp= {.ts= AV_NOPTS_VALUE, .back_ptr= AV_NOPTS_VALUE};
- syncpoint_t *sp, *next_node[2]= {&nopts_sp, &nopts_sp};
+ Syncpoint dummy={.ts= pts*av_q2d(st->time_base)*AV_TIME_BASE};
+ Syncpoint nopts_sp= {.ts= AV_NOPTS_VALUE, .back_ptr= AV_NOPTS_VALUE};
+ Syncpoint *sp, *next_node[2]= {&nopts_sp, &nopts_sp};
int64_t pos, pos2, ts;
int i;
diff --git a/libavformat/nutenc.c b/libavformat/nutenc.c
index 478fc4f..54d4b07 100644
--- a/libavformat/nutenc.c
+++ b/libavformat/nutenc.c
@@ -260,7 +260,7 @@ static void put_v(ByteIOContext *bc, uint64_t val){
put_byte(bc, val&127);
}
-static void put_t(NUTContext *nut, StreamContext *nus, ByteIOContext *bc, uint64_t val){
+static void put_tt(NUTContext *nut, StreamContext *nus, ByteIOContext *bc, uint64_t val){
val *= nut->time_base_count;
val += nus->time_base - nut->time_base;
put_v(bc, val);
@@ -664,7 +664,7 @@ static int write_packet(AVFormatContext *s, AVPacket *pkt){
//FIXME: Ensure store_sp is 1 in the first place.
if(store_sp){
- syncpoint_t *sp, dummy= {.pos= INT64_MAX};
+ Syncpoint *sp, dummy= {.pos= INT64_MAX};
ff_nut_reset_ts(nut, *nus->time_base, pkt->dts);
for(i=0; i<s->nb_streams; i++){
@@ -684,7 +684,7 @@ static int write_packet(AVFormatContext *s, AVPacket *pkt){
ret = url_open_dyn_buf(&dyn_bc);
if(ret < 0)
return ret;
- put_t(nut, nus, dyn_bc, pkt->dts);
+ put_tt(nut, nus, dyn_bc, pkt->dts);
put_v(dyn_bc, sp ? (nut->last_syncpoint_pos - sp->pos)>>4 : 0);
put_packet(nut, bc, dyn_bc, 1, SYNCPOINT_STARTCODE);
diff --git a/libavformat/nuv.c b/libavformat/nuv.c
index bd98839..7a01e68 100644
--- a/libavformat/nuv.c
+++ b/libavformat/nuv.c
@@ -33,7 +33,7 @@ typedef enum {
NUV_AUDIO = 'A',
NUV_SEEKP = 'R',
NUV_MYTHEXT = 'X'
-} frametype_t;
+} nuv_frametype;
static int nuv_probe(AVProbeData *p) {
if (!memcmp(p->buf, "NuppelVideo", 12))
@@ -55,7 +55,7 @@ static int nuv_probe(AVProbeData *p) {
*/
static int get_codec_data(ByteIOContext *pb, AVStream *vst,
AVStream *ast, int myth) {
- frametype_t frametype;
+ nuv_frametype frametype;
if (!vst && !myth)
return 1; // no codec data needed
while (!url_feof(pb)) {
@@ -191,7 +191,7 @@ static int nuv_packet(AVFormatContext *s, AVPacket *pkt) {
NUVContext *ctx = s->priv_data;
ByteIOContext *pb = s->pb;
uint8_t hdr[HDRSIZE];
- frametype_t frametype;
+ nuv_frametype frametype;
int ret, size;
while (!url_feof(pb)) {
int copyhdrsize = ctx->rtjpg_video ? HDRSIZE : 0;
diff --git a/libavformat/oggdec.h b/libavformat/oggdec.h
index 1a73fe0..d238e99 100644
--- a/libavformat/oggdec.h
+++ b/libavformat/oggdec.h
@@ -85,6 +85,6 @@ extern const struct ogg_codec ff_speex_codec;
extern const struct ogg_codec ff_theora_codec;
extern const struct ogg_codec ff_vorbis_codec;
-extern int vorbis_comment(AVFormatContext *ms, uint8_t *buf, int size);
+int vorbis_comment(AVFormatContext *ms, uint8_t *buf, int size);
#endif /* AVFORMAT_OGGDEC_H */
diff --git a/libavformat/oggparseflac.c b/libavformat/oggparseflac.c
index b1332c6..802453a 100644
--- a/libavformat/oggparseflac.c
+++ b/libavformat/oggparseflac.c
@@ -38,21 +38,21 @@ flac_header (AVFormatContext * s, int idx)
return 0;
init_get_bits(&gb, os->buf + os->pstart, os->psize*8);
- get_bits(&gb, 1); /* metadata_last */
+ skip_bits1(&gb); /* metadata_last */
mdt = get_bits(&gb, 7);
if (mdt == 0x7f) {
- skip_bits(&gb, 4*8); /* "FLAC" */
+ skip_bits_long(&gb, 4*8); /* "FLAC" */
if(get_bits(&gb, 8) != 1) /* unsupported major version */
return -1;
- skip_bits(&gb, 8 + 16); /* minor version + header count */
- skip_bits(&gb, 4*8); /* "fLaC" */
+ skip_bits_long(&gb, 8 + 16); /* minor version + header count */
+ skip_bits_long(&gb, 4*8); /* "fLaC" */
/* METADATA_BLOCK_HEADER */
if (get_bits_long(&gb, 32) != FLAC_STREAMINFO_SIZE)
return -1;
- skip_bits(&gb, 16*2+24*2);
+ skip_bits_long(&gb, 16*2+24*2);
st->codec->sample_rate = get_bits_long(&gb, 20);
st->codec->channels = get_bits(&gb, 3) + 1;
diff --git a/libavformat/oggparsevorbis.c b/libavformat/oggparsevorbis.c
index 1f0bcfe..a1eb20d 100644
--- a/libavformat/oggparsevorbis.c
+++ b/libavformat/oggparsevorbis.c
@@ -30,7 +30,7 @@
#include "avformat.h"
#include "oggdec.h"
-extern int
+int
vorbis_comment(AVFormatContext * as, uint8_t *buf, int size)
{
const uint8_t *p = buf;
diff --git a/libavformat/os_support.h b/libavformat/os_support.h
index 2fd3112..0b6a292 100644
--- a/libavformat/os_support.h
+++ b/libavformat/os_support.h
@@ -93,7 +93,7 @@ struct pollfd {
#define POLLNVAL 0x1000 /* invalid file descriptor */
-extern int poll(struct pollfd *fds, nfds_t numfds, int timeout);
+int poll(struct pollfd *fds, nfds_t numfds, int timeout);
#endif /* HAVE_POLL_H */
#endif /* CONFIG_FFSERVER */
#endif /* CONFIG_NETWORK */
diff --git a/libavformat/raw.c b/libavformat/raw.c
index f2a886c..b93cdcc 100644
--- a/libavformat/raw.c
+++ b/libavformat/raw.c
@@ -577,6 +577,38 @@ static int flac_probe(AVProbeData *p)
}
#endif
+#ifdef CONFIG_AAC_DEMUXER
+static int adts_aac_probe(AVProbeData *p)
+{
+ int max_frames = 0, first_frames = 0;
+ int fsize, frames;
+ uint8_t *buf2;
+ uint8_t *buf = p->buf;
+ uint8_t *end = buf + p->buf_size - 7;
+
+ for(; buf < end; buf= buf2+1) {
+ buf2 = buf;
+
+ for(frames = 0; buf2 < end; frames++) {
+ uint32_t header = AV_RB16(buf2);
+ if((header&0xFFF6) != 0xFFF0)
+ break;
+ fsize = (AV_RB32(buf2+3)>>13) & 0x8FFF;
+ if(fsize < 7)
+ break;
+ buf2 += fsize;
+ }
+ max_frames = FFMAX(max_frames, frames);
+ if(buf == p->buf)
+ first_frames= frames;
+ }
+ if (first_frames>=3) return AVPROBE_SCORE_MAX/2+1;
+ else if(max_frames>500)return AVPROBE_SCORE_MAX/2;
+ else if(max_frames>=3) return AVPROBE_SCORE_MAX/4;
+ else if(max_frames>=1) return 1;
+ else return 0;
+}
+#endif
/* Note: Do not forget to add new entries to the Makefile as well. */
@@ -585,7 +617,7 @@ AVInputFormat aac_demuxer = {
"aac",
NULL_IF_CONFIG_SMALL("ADTS AAC"),
0,
- NULL,
+ adts_aac_probe,
audio_read_header,
raw_read_partial_packet,
.flags= AVFMT_GENERIC_INDEX,
@@ -891,7 +923,7 @@ AVInputFormat m4v_demuxer = {
video_read_header,
raw_read_partial_packet,
.flags= AVFMT_GENERIC_INDEX,
- .extensions = "m4v", //FIXME remove after writing mpeg4_probe
+ .extensions = "m4v",
.value = CODEC_ID_MPEG4,
};
#endif
diff --git a/libavformat/rdt.c b/libavformat/rdt.c
index daf8cbe..3680527 100644
--- a/libavformat/rdt.c
+++ b/libavformat/rdt.c
@@ -33,17 +33,25 @@
#include "libavutil/md5.h"
#include "rm.h"
#include "internal.h"
+#include <libavcodec/bitstream.h>
struct RDTDemuxContext {
- AVFormatContext *ic;
- AVStream *st;
+ AVFormatContext *ic; /**< the containing (RTSP) demux context */
+ /** Each RDT stream-set (represented by one RTSPStream) can contain
+ * multiple streams (of the same content, but with possibly different
+ * codecs/bitrates). Each such stream is represented by one AVStream
+ * in the AVFormatContext, and this variable points to the offset in
+ * that array such that the first is the first stream of this set. */
+ AVStream **streams;
+ int n_streams; /**< streams with identifical content in this set */
void *dynamic_protocol_context;
DynamicPayloadPacketHandlerProc parse_packet;
- uint32_t prev_sn, prev_ts;
+ uint32_t prev_timestamp;
+ int prev_set_id, prev_stream_id;
};
RDTDemuxContext *
-ff_rdt_parse_open(AVFormatContext *ic, AVStream *st,
+ff_rdt_parse_open(AVFormatContext *ic, int first_stream_of_set_idx,
void *priv_data, RTPDynamicProtocolHandler *handler)
{
RDTDemuxContext *s = av_mallocz(sizeof(RDTDemuxContext));
@@ -51,9 +59,14 @@ ff_rdt_parse_open(AVFormatContext *ic, AVStream *st,
return NULL;
s->ic = ic;
- s->st = st;
- s->prev_sn = -1;
- s->prev_ts = -1;
+ s->streams = &ic->streams[first_stream_of_set_idx];
+ do {
+ s->n_streams++;
+ } while (first_stream_of_set_idx + s->n_streams < ic->nb_streams &&
+ s->streams[s->n_streams]->priv_data == s->streams[0]->priv_data);
+ s->prev_set_id = -1;
+ s->prev_stream_id = -1;
+ s->prev_timestamp = -1;
s->parse_packet = handler->parse_packet;
s->dynamic_protocol_context = priv_data;
@@ -63,14 +76,21 @@ ff_rdt_parse_open(AVFormatContext *ic, AVStream *st,
void
ff_rdt_parse_close(RDTDemuxContext *s)
{
+ int i;
+
+ for (i = 1; i < s->n_streams; i++)
+ s->streams[i]->priv_data = NULL;
+
av_free(s);
}
struct PayloadContext {
AVFormatContext *rmctx;
+ RMStream *rmst[MAX_STREAMS];
uint8_t *mlti_data;
unsigned int mlti_data_size;
char buffer[RTP_MAX_PACKET_LENGTH + FF_INPUT_BUFFER_PADDING_SIZE];
+ int audio_pkt_cnt; /**< remaining audio packets in rmdec */
};
void
@@ -115,7 +135,7 @@ ff_rdt_calc_response_and_checksum(char response[41], char chksum[9],
static int
rdt_load_mdpr (PayloadContext *rdt, AVStream *st, int rule_nr)
{
- ByteIOContext *pb;
+ ByteIOContext pb;
int size;
uint32_t tag;
@@ -135,35 +155,34 @@ rdt_load_mdpr (PayloadContext *rdt, AVStream *st, int rule_nr)
*/
if (!rdt->mlti_data)
return -1;
- url_open_buf(&pb, rdt->mlti_data, rdt->mlti_data_size, URL_RDONLY);
- tag = get_le32(pb);
+ init_put_byte(&pb, rdt->mlti_data, rdt->mlti_data_size, 0,
+ NULL, NULL, NULL, NULL);
+ tag = get_le32(&pb);
if (tag == MKTAG('M', 'L', 'T', 'I')) {
int num, chunk_nr;
/* read index of MDPR chunk numbers */
- num = get_be16(pb);
+ num = get_be16(&pb);
if (rule_nr < 0 || rule_nr >= num)
return -1;
- url_fskip(pb, rule_nr * 2);
- chunk_nr = get_be16(pb);
- url_fskip(pb, (num - 1 - rule_nr) * 2);
+ url_fskip(&pb, rule_nr * 2);
+ chunk_nr = get_be16(&pb);
+ url_fskip(&pb, (num - 1 - rule_nr) * 2);
/* read MDPR chunks */
- num = get_be16(pb);
+ num = get_be16(&pb);
if (chunk_nr >= num)
return -1;
while (chunk_nr--)
- url_fskip(pb, get_be32(pb));
- size = get_be32(pb);
+ url_fskip(&pb, get_be32(&pb));
+ size = get_be32(&pb);
} else {
size = rdt->mlti_data_size;
- url_fseek(pb, 0, SEEK_SET);
+ url_fseek(&pb, 0, SEEK_SET);
}
- rdt->rmctx->pb = pb;
- if (ff_rm_read_mdpr_codecdata(rdt->rmctx, st, size) < 0)
+ if (ff_rm_read_mdpr_codecdata(rdt->rmctx, &pb, st, rdt->rmst[st->index], size) < 0)
return -1;
- url_close_buf(pb);
return 0;
}
@@ -173,16 +192,27 @@ rdt_load_mdpr (PayloadContext *rdt, AVStream *st, int rule_nr)
int
ff_rdt_parse_header(const uint8_t *buf, int len,
- int *sn, int *seq, int *rn, uint32_t *ts)
+ int *pset_id, int *pseq_no, int *pstream_id,
+ int *pis_keyframe, uint32_t *ptimestamp)
{
- int consumed = 10;
+ GetBitContext gb;
+ int consumed = 0, set_id, seq_no, stream_id, is_keyframe,
+ len_included, need_reliable;
+ uint32_t timestamp;
+
+ /* skip status packets */
+ while (len >= 5 && buf[1] == 0xFF /* status packet */) {
+ int pkt_len;
+
+ if (!(buf[0] & 0x80))
+ return -1; /* not followed by a data packet */
- if (len > 0 && (buf[0] < 0x40 || buf[0] > 0x42)) {
- buf += 9;
- len -= 9;
- consumed += 9;
+ pkt_len = AV_RB16(buf+3);
+ buf += pkt_len;
+ len -= pkt_len;
+ consumed += pkt_len;
}
- if (len < 10)
+ if (len < 16)
return -1;
/**
* Layout of the header (in bits):
@@ -235,12 +265,32 @@ ff_rdt_parse_header(const uint8_t *buf, int len,
* [2] http://www.wireshark.org/docs/dfref/r/rdt.html and
* http://anonsvn.wireshark.org/viewvc/trunk/epan/dissectors/packet-rdt.c
*/
- if (sn) *sn = (buf[0]>>1) & 0x1f;
- if (seq) *seq = AV_RB16(buf+1);
- if (ts) *ts = AV_RB32(buf+4);
- if (rn) *rn = buf[3] & 0x3f;
-
- return consumed;
+ init_get_bits(&gb, buf, len << 3);
+ len_included = get_bits1(&gb);
+ need_reliable = get_bits1(&gb);
+ set_id = get_bits(&gb, 5);
+ skip_bits(&gb, 1);
+ seq_no = get_bits(&gb, 16);
+ if (len_included)
+ skip_bits(&gb, 16);
+ skip_bits(&gb, 2);
+ stream_id = get_bits(&gb, 5);
+ is_keyframe = !get_bits1(&gb);
+ timestamp = get_bits_long(&gb, 32);
+ if (set_id == 0x1f)
+ set_id = get_bits(&gb, 16);
+ if (need_reliable)
+ skip_bits(&gb, 16);
+ if (stream_id == 0x1f)
+ stream_id = get_bits(&gb, 16);
+
+ if (pset_id) *pset_id = set_id;
+ if (pseq_no) *pseq_no = seq_no;
+ if (pstream_id) *pstream_id = stream_id;
+ if (pis_keyframe) *pis_keyframe = is_keyframe;
+ if (ptimestamp) *ptimestamp = timestamp;
+
+ return consumed + (get_bits_count(&gb) >> 3);
}
/**< return 0 on packet, no more left, 1 on packet, 1 on partial packet... */
@@ -250,73 +300,83 @@ rdt_parse_packet (PayloadContext *rdt, AVStream *st,
const uint8_t *buf, int len, int flags)
{
int seq = 1, res;
- ByteIOContext *pb = rdt->rmctx->pb;
- RMContext *rm = rdt->rmctx->priv_data;
+ ByteIOContext pb;
- if (rm->audio_pkt_cnt == 0) {
+ if (rdt->audio_pkt_cnt == 0) {
int pos;
- url_open_buf (&pb, buf, len, URL_RDONLY);
+ init_put_byte(&pb, buf, len, 0, NULL, NULL, NULL, NULL);
flags = (flags & PKT_FLAG_KEY) ? 2 : 0;
- rdt->rmctx->pb = pb;
- res = ff_rm_parse_packet (rdt->rmctx, st, len, pkt,
+ res = ff_rm_parse_packet (rdt->rmctx, &pb, st, rdt->rmst[st->index], len, pkt,
&seq, &flags, timestamp);
- pos = url_ftell(pb);
- url_close_buf (pb);
+ pos = url_ftell(&pb);
if (res < 0)
return res;
- if (rm->audio_pkt_cnt > 0 &&
+ rdt->audio_pkt_cnt = res;
+ if (rdt->audio_pkt_cnt > 0 &&
st->codec->codec_id == CODEC_ID_AAC) {
memcpy (rdt->buffer, buf + pos, len - pos);
- url_open_buf (&pb, rdt->buffer, len - pos, URL_RDONLY);
- rdt->rmctx->pb = pb;
+ rdt->rmctx->pb = av_alloc_put_byte (rdt->buffer, len - pos, 0,
+ NULL, NULL, NULL, NULL);
}
} else {
- ff_rm_retrieve_cache (rdt->rmctx, st, pkt);
- if (rm->audio_pkt_cnt == 0 &&
+ rdt->audio_pkt_cnt =
+ ff_rm_retrieve_cache (rdt->rmctx, rdt->rmctx->pb,
+ st, rdt->rmst[st->index], pkt);
+ if (rdt->audio_pkt_cnt == 0 &&
st->codec->codec_id == CODEC_ID_AAC)
- url_close_buf (pb);
+ av_freep(&rdt->rmctx->pb);
}
pkt->stream_index = st->index;
pkt->pts = *timestamp;
- return rm->audio_pkt_cnt > 0;
+ return rdt->audio_pkt_cnt > 0;
}
int
ff_rdt_parse_packet(RDTDemuxContext *s, AVPacket *pkt,
const uint8_t *buf, int len)
{
- int seq, flags = 0, rule, sn;
+ int seq_no, flags = 0, stream_id, set_id, is_keyframe;
uint32_t timestamp;
int rv= 0;
if (!s->parse_packet)
return -1;
- if (!buf) {
+ if (!buf && s->prev_stream_id != -1) {
/* return the next packets, if any */
timestamp= 0; ///< Should not be used if buf is NULL, but should be set to the timestamp of the packet returned....
rv= s->parse_packet(s->dynamic_protocol_context,
- s->st, pkt, ×tamp, NULL, 0, flags);
+ s->streams[s->prev_stream_id],
+ pkt, ×tamp, NULL, 0, flags);
return rv;
}
if (len < 12)
return -1;
- rv = ff_rdt_parse_header(buf, len, &sn, &seq, &rule, ×tamp);
+ rv = ff_rdt_parse_header(buf, len, &set_id, &seq_no, &stream_id, &is_keyframe, ×tamp);
if (rv < 0)
return rv;
- if (!(rule & 1) && (sn != s->prev_sn || timestamp != s->prev_ts)) {
+ if (is_keyframe &&
+ (set_id != s->prev_set_id || timestamp != s->prev_timestamp ||
+ stream_id != s->prev_stream_id)) {
flags |= PKT_FLAG_KEY;
- s->prev_sn = sn;
- s->prev_ts = timestamp;
+ s->prev_set_id = set_id;
+ s->prev_timestamp = timestamp;
}
+ s->prev_stream_id = stream_id;
buf += rv;
len -= rv;
+ if (s->prev_stream_id >= s->n_streams) {
+ s->prev_stream_id = -1;
+ return -1;
+ }
+
rv = s->parse_packet(s->dynamic_protocol_context,
- s->st, pkt, ×tamp, buf, len, flags);
+ s->streams[s->prev_stream_id],
+ pkt, ×tamp, buf, len, flags);
return rv;
}
@@ -329,15 +389,6 @@ ff_rdt_subscribe_rule (char *cmd, int size,
stream_nr, rule_nr * 2, stream_nr, rule_nr * 2 + 1);
}
-void
-ff_rdt_subscribe_rule2 (RDTDemuxContext *s, char *cmd, int size,
- int stream_nr, int rule_nr)
-{
- PayloadContext *rdt = s->dynamic_protocol_context;
-
- rdt_load_mdpr(rdt, s->st, rule_nr * 2);
-}
-
static unsigned char *
rdt_parse_b64buf (unsigned int *target_len, const char *p)
{
@@ -354,18 +405,109 @@ rdt_parse_b64buf (unsigned int *target_len, const char *p)
}
static int
-rdt_parse_sdp_line (AVStream *stream, PayloadContext *rdt, const char *line)
+rdt_parse_sdp_line (AVFormatContext *s, int st_index,
+ PayloadContext *rdt, const char *line)
{
+ AVStream *stream = s->streams[st_index];
const char *p = line;
if (av_strstart(p, "OpaqueData:buffer;", &p)) {
rdt->mlti_data = rdt_parse_b64buf(&rdt->mlti_data_size, p);
} else if (av_strstart(p, "StartTime:integer;", &p))
stream->first_dts = atoi(p);
+ else if (av_strstart(p, "ASMRuleBook:string;", &p)) {
+ int n = st_index, first = -1;
+
+ for (n = 0; n < s->nb_streams; n++)
+ if (s->streams[n]->priv_data == stream->priv_data) {
+ if (first == -1) first = n;
+ rdt->rmst[s->streams[n]->index] = ff_rm_alloc_rmstream();
+ rdt_load_mdpr(rdt, s->streams[n], (n - first) * 2);
+
+ if (s->streams[n]->codec->codec_id == CODEC_ID_AAC)
+ s->streams[n]->codec->frame_size = 1; // FIXME
+ }
+ }
return 0;
}
+static void
+real_parse_asm_rule(AVStream *st, const char *p, const char *end)
+{
+ do {
+ /* can be either averagebandwidth= or AverageBandwidth= */
+ if (sscanf(p, " %*1[Aa]verage%*1[Bb]andwidth=%d", &st->codec->bit_rate) == 1)
+ break;
+ if (!(p = strchr(p, ',')) || p > end)
+ p = end;
+ p++;
+ } while (p < end);
+}
+
+static AVStream *
+add_dstream(AVFormatContext *s, AVStream *orig_st)
+{
+ AVStream *st;
+
+ if (!(st = av_new_stream(s, 0)))
+ return NULL;
+ st->codec->codec_type = orig_st->codec->codec_type;
+ st->priv_data = orig_st->priv_data;
+ st->first_dts = orig_st->first_dts;
+
+ return st;
+}
+
+static void
+real_parse_asm_rulebook(AVFormatContext *s, AVStream *orig_st,
+ const char *p)
+{
+ const char *end;
+ int n_rules, odd = 0;
+ AVStream *st;
+
+ /**
+ * The ASMRuleBook contains a list of comma-separated strings per rule,
+ * and each rule is separated by a ;. The last one also has a ; at the
+ * end so we can use it as delimiter.
+ * Every rule occurs twice, once for when the RTSP packet header marker
+ * is set and once for if it isn't. We only read the first because we
+ * don't care much (that's what the "odd" variable is for).
+ * Each rule contains a set of one or more statements, optionally
+ * preceeded by a single condition. If there's a condition, the rule
+ * starts with a '#'. Multiple conditions are merged between brackets,
+ * so there are never multiple conditions spread out over separate
+ * statements. Generally, these conditions are bitrate limits (min/max)
+ * for multi-bitrate streams.
+ */
+ if (*p == '\"') p++;
+ for (n_rules = 0; s->nb_streams < MAX_STREAMS;) {
+ if (!(end = strchr(p, ';')))
+ break;
+ if (!odd && end != p) {
+ if (n_rules > 0)
+ st = add_dstream(s, orig_st);
+ else
+ st = orig_st;
+ real_parse_asm_rule(st, p, end);
+ n_rules++;
+ }
+ p = end + 1;
+ odd ^= 1;
+ }
+}
+
+void
+ff_real_parse_sdp_a_line (AVFormatContext *s, int stream_index,
+ const char *line)
+{
+ const char *p = line;
+
+ if (av_strstart(p, "ASMRuleBook:string;", &p))
+ real_parse_asm_rulebook(s, s->streams[stream_index], p);
+}
+
static PayloadContext *
rdt_new_extradata (void)
{
@@ -379,6 +521,13 @@ rdt_new_extradata (void)
static void
rdt_free_extradata (PayloadContext *rdt)
{
+ int i;
+
+ for (i = 0; i < MAX_STREAMS; i++)
+ if (rdt->rmst[i]) {
+ ff_rm_free_rmstream(rdt->rmst[i]);
+ av_freep(&rdt->rmst[i]);
+ }
if (rdt->rmctx)
av_close_input_stream(rdt->rmctx);
av_freep(&rdt->mlti_data);
diff --git a/libavformat/rdt.h b/libavformat/rdt.h
index aa6cbaa..e24a0d5 100644
--- a/libavformat/rdt.h
+++ b/libavformat/rdt.h
@@ -28,7 +28,18 @@
typedef struct RDTDemuxContext RDTDemuxContext;
-RDTDemuxContext *ff_rdt_parse_open(AVFormatContext *ic, AVStream *st,
+/**
+ * Allocate and init the RDT parsing context.
+ * @param ic the containing RTSP demuxer context
+ * @param first_stream_of_set_idx index to the first AVStream in the RTSP
+ * demuxer context's ic->streams array that is part of this
+ * particular stream's set of streams (with identical content)
+ * @param priv_data private data of the payload data handler context
+ * @param handler pointer to the parse_packet() payload parsing function
+ * @return a newly allocated RDTDemuxContext. Free with ff_rdt_parse_close().
+ */
+RDTDemuxContext *ff_rdt_parse_open(AVFormatContext *ic,
+ int first_stream_of_set_idx,
void *priv_data,
RTPDynamicProtocolHandler *handler);
void ff_rdt_parse_close(RDTDemuxContext *s);
@@ -63,23 +74,22 @@ void av_register_rdt_dynamic_payload_handlers(void);
*/
void ff_rdt_subscribe_rule(char *cmd, int size,
int stream_nr, int rule_nr);
-// FIXME this will be removed ASAP
-void ff_rdt_subscribe_rule2(RDTDemuxContext *s, char *cmd, int size,
- int stream_nr, int rule_nr);
/**
* Parse RDT-style packet header.
*
* @param buf input buffer
* @param len length of input buffer
- * @param sn will be set to the stream number this packet belongs to
- * @param seq will be set to the sequence number this packet belongs to
- * @param rn will be set to the rule number this packet belongs to
- * @param ts will be set to the timestamp of the packet
+ * @param set_id will be set to the set ID this packet belongs to
+ * @param seq_no will be set to the sequence number of the packet
+ * @param stream_id will be set to the stream ID this packet belongs to
+ * @param is_keyframe will be whether this packet belongs to a keyframe
+ * @param timestamp will be set to the timestamp of the packet
* @return the amount of bytes consumed, or <0 on error
*/
int ff_rdt_parse_header(const uint8_t *buf, int len,
- int *sn, int *seq, int *rn, uint32_t *ts);
+ int *set_id, int *seq_no, int *stream_id,
+ int *is_keyframe, uint32_t *timestamp);
/**
* Parse RDT-style packet data (header + media data).
@@ -88,4 +98,15 @@ int ff_rdt_parse_header(const uint8_t *buf, int len,
int ff_rdt_parse_packet(RDTDemuxContext *s, AVPacket *pkt,
const uint8_t *buf, int len);
+/**
+ * Parse a server-related SDP line.
+ *
+ * @param s the RTSP AVFormatContext
+ * @param stream_index the index of the first stream in the set represented
+ * by the SDP m= line (in s->streams)
+ * @param buf the SDP line
+ */
+void ff_real_parse_sdp_a_line(AVFormatContext *s, int stream_index,
+ const char *buf);
+
#endif /* AVFORMAT_RDT_H */
diff --git a/libavformat/riff.c b/libavformat/riff.c
index ae6b7ba..0f9b177 100644
--- a/libavformat/riff.c
+++ b/libavformat/riff.c
@@ -33,6 +33,9 @@ const AVCodecTag codec_bmp_tags[] = {
{ CODEC_ID_H264, MKTAG('a', 'v', 'c', '1') },
{ CODEC_ID_H264, MKTAG('V', 'S', 'S', 'H') },
{ CODEC_ID_H263, MKTAG('H', '2', '6', '3') },
+ { CODEC_ID_H263, MKTAG('X', '2', '6', '3') },
+ { CODEC_ID_H263, MKTAG('L', '2', '6', '3') },
+ { CODEC_ID_H263, MKTAG('V', 'X', '1', 'K') },
{ CODEC_ID_H263P, MKTAG('H', '2', '6', '3') },
{ CODEC_ID_H263I, MKTAG('I', '2', '6', '3') }, /* intel h263 */
{ CODEC_ID_H261, MKTAG('H', '2', '6', '1') },
@@ -53,18 +56,34 @@ const AVCodecTag codec_bmp_tags[] = {
{ CODEC_ID_MPEG4, MKTAG('S', 'E', 'D', 'G') },
{ CODEC_ID_MPEG4, MKTAG('R', 'M', 'P', '4') },
{ CODEC_ID_MPEG4, MKTAG('3', 'I', 'V', '2') },
+ { CODEC_ID_MPEG4, MKTAG('F', 'F', 'D', 'S') },
+ { CODEC_ID_MPEG4, MKTAG('F', 'V', 'F', 'W') },
+ { CODEC_ID_MPEG4, MKTAG('D', 'C', 'O', 'D') },
+ { CODEC_ID_MPEG4, MKTAG('M', 'V', 'X', 'M') },
+ { CODEC_ID_MPEG4, MKTAG('P', 'M', '4', 'V') },
+ { CODEC_ID_MPEG4, MKTAG('S', 'M', 'P', '4') },
+ { CODEC_ID_MPEG4, MKTAG('D', 'X', 'G', 'M') },
+ { CODEC_ID_MPEG4, MKTAG('V', 'I', 'D', 'M') },
+ { CODEC_ID_MPEG4, MKTAG('M', '4', 'T', '3') },
+ { CODEC_ID_MPEG4, MKTAG('G', 'E', 'O', 'X') },
+ { CODEC_ID_MPEG4, MKTAG('H', 'D', 'X', '4') }, /* flipped video */
+ { CODEC_ID_MPEG4, MKTAG('D', 'M', 'K', '2') },
+ { CODEC_ID_MPEG4, MKTAG('D', 'I', 'G', 'I') },
+ { CODEC_ID_MPEG4, MKTAG('I', 'N', 'M', 'C') },
{ CODEC_ID_MSMPEG4V3, MKTAG('D', 'I', 'V', '3') }, /* default signature when using MSMPEG4 */
{ CODEC_ID_MSMPEG4V3, MKTAG('M', 'P', '4', '3') },
{ CODEC_ID_MSMPEG4V3, MKTAG('M', 'P', 'G', '3') },
{ CODEC_ID_MSMPEG4V3, MKTAG('D', 'I', 'V', '5') },
{ CODEC_ID_MSMPEG4V3, MKTAG('D', 'I', 'V', '6') },
{ CODEC_ID_MSMPEG4V3, MKTAG('D', 'I', 'V', '4') },
+ { CODEC_ID_MSMPEG4V3, MKTAG('D', 'V', 'X', '3') },
{ CODEC_ID_MSMPEG4V3, MKTAG('A', 'P', '4', '1') },
{ CODEC_ID_MSMPEG4V3, MKTAG('C', 'O', 'L', '1') },
{ CODEC_ID_MSMPEG4V3, MKTAG('C', 'O', 'L', '0') },
{ CODEC_ID_MSMPEG4V2, MKTAG('M', 'P', '4', '2') },
{ CODEC_ID_MSMPEG4V2, MKTAG('D', 'I', 'V', '2') },
{ CODEC_ID_MSMPEG4V1, MKTAG('M', 'P', 'G', '4') },
+ { CODEC_ID_MSMPEG4V1, MKTAG('M', 'P', '4', '1') },
{ CODEC_ID_WMV1, MKTAG('W', 'M', 'V', '1') },
{ CODEC_ID_WMV2, MKTAG('W', 'M', 'V', '2') },
{ CODEC_ID_DVVIDEO, MKTAG('d', 'v', 's', 'd') },
@@ -73,18 +92,22 @@ const AVCodecTag codec_bmp_tags[] = {
{ CODEC_ID_DVVIDEO, MKTAG('d', 'v', '2', '5') },
{ CODEC_ID_DVVIDEO, MKTAG('d', 'v', '5', '0') },
{ CODEC_ID_DVVIDEO, MKTAG('c', 'd', 'v', 'c') }, /* Canopus DV */
+ { CODEC_ID_DVVIDEO, MKTAG('d', 'v', 'c', ' ') },
{ CODEC_ID_MPEG1VIDEO, MKTAG('m', 'p', 'g', '1') },
{ CODEC_ID_MPEG1VIDEO, MKTAG('m', 'p', 'g', '2') },
{ CODEC_ID_MPEG2VIDEO, MKTAG('m', 'p', 'g', '2') },
{ CODEC_ID_MPEG2VIDEO, MKTAG('M', 'P', 'E', 'G') },
{ CODEC_ID_MPEG1VIDEO, MKTAG('P', 'I', 'M', '1') },
+ { CODEC_ID_MPEG2VIDEO, MKTAG('P', 'I', 'M', '2') },
{ CODEC_ID_MPEG1VIDEO, MKTAG('V', 'C', 'R', '2') },
{ CODEC_ID_MPEG1VIDEO, MKTAG( 1 , 0 , 0 , 16) },
{ CODEC_ID_MPEG2VIDEO, MKTAG( 2 , 0 , 0 , 16) },
{ CODEC_ID_MPEG2VIDEO, MKTAG('D', 'V', 'R', ' ') },
{ CODEC_ID_MPEG2VIDEO, MKTAG('M', 'M', 'E', 'S') },
+ { CODEC_ID_MPEG2VIDEO, MKTAG('L', 'M', 'P', '2') }, /* Lead MPEG2 in avi */
{ CODEC_ID_MJPEG, MKTAG('M', 'J', 'P', 'G') },
{ CODEC_ID_MJPEG, MKTAG('L', 'J', 'P', 'G') },
+ { CODEC_ID_MJPEG, MKTAG('d', 'm', 'b', '1') },
{ CODEC_ID_LJPEG, MKTAG('L', 'J', 'P', 'G') },
{ CODEC_ID_MJPEG, MKTAG('J', 'P', 'G', 'L') }, /* Pegasus lossless JPEG */
{ CODEC_ID_JPEGLS, MKTAG('M', 'J', 'L', 'S') }, /* JPEG-LS custom FOURCC for avi - encoder */
@@ -92,6 +115,9 @@ const AVCodecTag codec_bmp_tags[] = {
{ CODEC_ID_MJPEG, MKTAG('j', 'p', 'e', 'g') },
{ CODEC_ID_MJPEG, MKTAG('I', 'J', 'P', 'G') },
{ CODEC_ID_MJPEG, MKTAG('A', 'V', 'R', 'n') },
+ { CODEC_ID_MJPEG, MKTAG('A', 'C', 'D', 'V') },
+ { CODEC_ID_MJPEG, MKTAG('Q', 'I', 'V', 'G') },
+ { CODEC_ID_MJPEG, MKTAG('C', 'J', 'P', 'G') }, /* Creative Webcam JPEG */
{ CODEC_ID_HUFFYUV, MKTAG('H', 'F', 'Y', 'U') },
{ CODEC_ID_FFVHUFF, MKTAG('F', 'F', 'V', 'H') },
{ CODEC_ID_CYUV, MKTAG('C', 'Y', 'U', 'V') },
@@ -161,8 +187,12 @@ const AVCodecTag codec_bmp_tags[] = {
{ CODEC_ID_JPEG2000, MKTAG('M', 'J', '2', 'C') },
{ CODEC_ID_VMNC, MKTAG('V', 'M', 'n', 'c') },
{ CODEC_ID_TARGA, MKTAG('t', 'g', 'a', ' ') },
+ { CODEC_ID_PNG, MKTAG('M', 'P', 'N', 'G') },
{ CODEC_ID_CLJR, MKTAG('c', 'l', 'j', 'r') },
{ CODEC_ID_DIRAC, MKTAG('d', 'r', 'a', 'c') },
+ { CODEC_ID_RPZA, MKTAG('a', 'z', 'p', 'r') },
+ { CODEC_ID_RPZA, MKTAG('R', 'P', 'Z', 'A') },
+ { CODEC_ID_RPZA, MKTAG('r', 'p', 'z', 'a') },
{ CODEC_ID_NONE, 0 }
};
@@ -185,6 +215,8 @@ const AVCodecTag codec_wav_tags[] = {
{ CODEC_ID_ADPCM_G726, 0x0045 },
{ CODEC_ID_MP2, 0x0050 },
{ CODEC_ID_MP3, 0x0055 },
+ { CODEC_ID_AMR_NB, 0x0057 },
+ { CODEC_ID_AMR_WB, 0x0058 },
{ CODEC_ID_ADPCM_IMA_DK4, 0x0061 }, /* rogue format number */
{ CODEC_ID_ADPCM_IMA_DK3, 0x0062 }, /* rogue format number */
{ CODEC_ID_VOXWARE, 0x0075 },
@@ -202,6 +234,7 @@ const AVCodecTag codec_wav_tags[] = {
{ CODEC_ID_SONIC, 0x2048 },
{ CODEC_ID_SONIC_LS, 0x2048 },
{ CODEC_ID_AAC, 0x706d },
+ { CODEC_ID_AAC, 0x4143 },
{ CODEC_ID_FLAC, 0xF1AC },
{ CODEC_ID_ADPCM_SWF, ('S'<<8)+'F' },
{ CODEC_ID_VORBIS, ('V'<<8)+'o' }, //HACK/FIXME, does vorbis in WAV/AVI have an (in)official id?
@@ -395,9 +428,9 @@ void get_wav_header(ByteIOContext *pb, AVCodecContext *codec, int size)
}
-int wav_codec_get_id(unsigned int tag, int bps)
+enum CodecID wav_codec_get_id(unsigned int tag, int bps)
{
- int id;
+ enum CodecID id;
id = codec_get_id(codec_wav_tags, tag);
if (id <= 0)
return id;
diff --git a/libavformat/riff.h b/libavformat/riff.h
index bd4e9f1..93c1d71 100644
--- a/libavformat/riff.h
+++ b/libavformat/riff.h
@@ -41,7 +41,7 @@ typedef struct AVCodecTag {
void put_bmp_header(ByteIOContext *pb, AVCodecContext *enc, const AVCodecTag *tags, int for_asf);
int put_wav_header(ByteIOContext *pb, AVCodecContext *enc);
-int wav_codec_get_id(unsigned int tag, int bps);
+enum CodecID wav_codec_get_id(unsigned int tag, int bps);
void get_wav_header(ByteIOContext *pb, AVCodecContext *codec, int size);
extern const AVCodecTag codec_bmp_tags[];
diff --git a/libavformat/rm.h b/libavformat/rm.h
index 4ad1c30..2f45c0f 100644
--- a/libavformat/rm.h
+++ b/libavformat/rm.h
@@ -24,44 +24,10 @@
#include "avformat.h"
+typedef struct RMStream RMStream;
-typedef struct {
- int nb_packets;
- int packet_total_size;
- int packet_max_size;
- /* codec related output */
- int bit_rate;
- float frame_rate;
- int nb_frames; /* current frame number */
- int total_frames; /* total number of frames */
- int num;
- AVCodecContext *enc;
-} StreamInfo;
-
-typedef struct {
- StreamInfo streams[2];
- StreamInfo *audio_stream, *video_stream;
- int data_pos; /* position of the data after the header */
- int nb_packets;
- int old_format;
- int current_stream;
- int remaining_len;
- uint8_t *videobuf; ///< place to store merged video frame
- int videobufsize; ///< current assembled frame size
- int videobufpos; ///< position for the next slice in the video buffer
- int curpic_num; ///< picture number of current frame
- int cur_slice, slices;
- int64_t pktpos; ///< first slice position in file
- /// Audio descrambling matrix parameters
- uint8_t *audiobuf; ///< place to store reordered audio data
- int64_t audiotimestamp; ///< Audio packet timestamp
- int sub_packet_cnt; // Subpacket counter, used while reading
- int sub_packet_size, sub_packet_h, coded_framesize; ///< Descrambling parameters from container
- int audio_stream_num; ///< Stream number for audio packets
- int audio_pkt_cnt; ///< Output packet counter
- int audio_framesize; /// Audio frame size from container
- int sub_packet_lengths[16]; /// Length of each aac subpacket
-} RMContext;
+RMStream *ff_rm_alloc_rmstream (void);
+void ff_rm_free_rmstream (RMStream *rms);
/*< input format for Realmedia-style RTSP streams */
extern AVInputFormat rdt_demuxer;
@@ -71,18 +37,24 @@ extern AVInputFormat rdt_demuxer;
* parameters.
*
* @param s context containing RMContext and ByteIOContext for stream reading
+ * @param pb context to read the data from
* @param st the stream that the MDPR chunk belongs to and where to store the
* parameters read from the chunk into
+ * @param rst real-specific stream information
* @param codec_data_size size of the MDPR chunk
* @return 0 on success, errno codes on error
*/
-int ff_rm_read_mdpr_codecdata (AVFormatContext *s, AVStream *st, int codec_data_size);
+int ff_rm_read_mdpr_codecdata (AVFormatContext *s, ByteIOContext *pb,
+ AVStream *st, RMStream *rst,
+ int codec_data_size);
/**
* Parse one rm-stream packet from the input bytestream.
*
* @param s context containing RMContext and ByteIOContext for stream reading
+ * @param pb context to read the data from
* @param st stream to which the packet to be read belongs
+ * @param rst Real-specific stream information
* @param len packet length to read from the input
* @param pkt packet location to store the parsed packet data
* @param seq pointer to an integer containing the sequence number, may be
@@ -90,9 +62,12 @@ int ff_rm_read_mdpr_codecdata (AVFormatContext *s, AVStream *st, int codec_data_
* @param flags pointer to an integer containing the packet flags, may be
updated
* @param ts pointer to timestamp, may be updated
- * @return 0 on success, errno codes on error
+ * @return >=0 on success (where >0 indicates there are cached samples that
+ * can be retrieved with subsequent calls to ff_rm_retrieve_cache()),
+ * errno codes on error
*/
-int ff_rm_parse_packet (AVFormatContext *s, AVStream *st, int len,
+int ff_rm_parse_packet (AVFormatContext *s, ByteIOContext *pb,
+ AVStream *st, RMStream *rst, int len,
AVPacket *pkt, int *seq, int *flags, int64_t *ts);
/**
@@ -104,9 +79,14 @@ int ff_rm_parse_packet (AVFormatContext *s, AVStream *st, int len,
* of those packets can be retrieved sequentially.
*
* @param s context containing RMContext and ByteIOContext for stream reading
+ * @param pb context to read the data from
* @param st stream that this packet belongs to
+ * @param rst Real-specific stream information
* @param pkt location to store the packet data
+ * @returns the number of samples left for subsequent calls to this same
+ * function, or 0 if all samples have been retrieved.
*/
-void ff_rm_retrieve_cache (AVFormatContext *s, AVStream *st, AVPacket *pkt);
+int ff_rm_retrieve_cache (AVFormatContext *s, ByteIOContext *pb,
+ AVStream *st, RMStream *rst, AVPacket *pkt);
#endif /* AVFORMAT_RM_H */
diff --git a/libavformat/rmdec.c b/libavformat/rmdec.c
index 9db09e1..fc066df 100644
--- a/libavformat/rmdec.c
+++ b/libavformat/rmdec.c
@@ -23,6 +23,30 @@
#include "avformat.h"
#include "rm.h"
+struct RMStream {
+ AVPacket pkt; ///< place to store merged video frame / reordered audio data
+ int videobufsize; ///< current assembled frame size
+ int videobufpos; ///< position for the next slice in the video buffer
+ int curpic_num; ///< picture number of current frame
+ int cur_slice, slices;
+ int64_t pktpos; ///< first slice position in file
+ /// Audio descrambling matrix parameters
+ int64_t audiotimestamp; ///< Audio packet timestamp
+ int sub_packet_cnt; // Subpacket counter, used while reading
+ int sub_packet_size, sub_packet_h, coded_framesize; ///< Descrambling parameters from container
+ int audio_framesize; /// Audio frame size from container
+ int sub_packet_lengths[16]; /// Length of each subpacket
+};
+
+typedef struct {
+ int nb_packets;
+ int old_format;
+ int current_stream;
+ int remaining_len;
+ int audio_stream_num; ///< Stream number for audio packets
+ int audio_pkt_cnt; ///< Output packet counter
+} RMDemuxContext;
+
static inline void get_strl(ByteIOContext *pb, char *buf, int buf_size, int len)
{
int i;
@@ -47,30 +71,37 @@ static void get_str8(ByteIOContext *pb, char *buf, int buf_size)
get_strl(pb, buf, buf_size, get_byte(pb));
}
-static int rm_read_audio_stream_info(AVFormatContext *s, AVStream *st,
- int read_all)
+RMStream *ff_rm_alloc_rmstream (void)
+{
+ RMStream *rms = av_mallocz(sizeof(RMStream));
+ rms->curpic_num = -1;
+ return rms;
+}
+
+void ff_rm_free_rmstream (RMStream *rms)
+{
+ av_free_packet(&rms->pkt);
+}
+
+static int rm_read_audio_stream_info(AVFormatContext *s, ByteIOContext *pb,
+ AVStream *st, RMStream *ast, int read_all)
{
- RMContext *rm = s->priv_data;
- ByteIOContext *pb = s->pb;
char buf[256];
uint32_t version;
- int i;
/* ra type header */
version = get_be32(pb); /* version */
if (((version >> 16) & 0xff) == 3) {
int64_t startpos = url_ftell(pb);
- /* very old version */
- for(i = 0; i < 14; i++)
- get_byte(pb);
+ url_fskip(pb, 14);
get_str8(pb, s->title, sizeof(s->title));
get_str8(pb, s->author, sizeof(s->author));
get_str8(pb, s->copyright, sizeof(s->copyright));
get_str8(pb, s->comment, sizeof(s->comment));
if ((startpos + (version & 0xffff)) >= url_ftell(pb) + 2) {
- // fourcc (should always be "lpcJ")
- get_byte(pb);
- get_str8(pb, buf, sizeof(buf));
+ // fourcc (should always be "lpcJ")
+ get_byte(pb);
+ get_str8(pb, buf, sizeof(buf));
}
// Skip extra header crap (this should never happen)
if ((startpos + (version & 0xffff)) > url_ftell(pb))
@@ -87,25 +118,23 @@ static int rm_read_audio_stream_info(AVFormatContext *s, AVStream *st,
get_be16(pb); /* version2 */
get_be32(pb); /* header size */
flavor= get_be16(pb); /* add codec info / flavor */
- rm->coded_framesize = coded_framesize = get_be32(pb); /* coded frame size */
+ ast->coded_framesize = coded_framesize = get_be32(pb); /* coded frame size */
get_be32(pb); /* ??? */
get_be32(pb); /* ??? */
get_be32(pb); /* ??? */
- rm->sub_packet_h = sub_packet_h = get_be16(pb); /* 1 */
+ ast->sub_packet_h = sub_packet_h = get_be16(pb); /* 1 */
st->codec->block_align= get_be16(pb); /* frame size */
- rm->sub_packet_size = sub_packet_size = get_be16(pb); /* sub packet size */
+ ast->sub_packet_size = sub_packet_size = get_be16(pb); /* sub packet size */
get_be16(pb); /* ??? */
if (((version >> 16) & 0xff) == 5) {
- get_be16(pb); get_be16(pb); get_be16(pb); }
+ get_be16(pb); get_be16(pb); get_be16(pb);
+ }
st->codec->sample_rate = get_be16(pb);
get_be32(pb);
st->codec->channels = get_be16(pb);
if (((version >> 16) & 0xff) == 5) {
get_be32(pb);
- buf[0] = get_byte(pb);
- buf[1] = get_byte(pb);
- buf[2] = get_byte(pb);
- buf[3] = get_byte(pb);
+ get_buffer(pb, buf, 4);
buf[4] = 0;
} else {
get_str8(pb, buf, sizeof(buf)); /* desc */
@@ -118,17 +147,17 @@ static int rm_read_audio_stream_info(AVFormatContext *s, AVStream *st,
} else if (!strcmp(buf, "28_8")) {
st->codec->codec_id = CODEC_ID_RA_288;
st->codec->extradata_size= 0;
- rm->audio_framesize = st->codec->block_align;
+ ast->audio_framesize = st->codec->block_align;
st->codec->block_align = coded_framesize;
- if(rm->audio_framesize >= UINT_MAX / sub_packet_h){
- av_log(s, AV_LOG_ERROR, "rm->audio_framesize * sub_packet_h too large\n");
+ if(ast->audio_framesize >= UINT_MAX / sub_packet_h){
+ av_log(s, AV_LOG_ERROR, "ast->audio_framesize * sub_packet_h too large\n");
return -1;
}
- rm->audiobuf = av_malloc(rm->audio_framesize * sub_packet_h);
+ av_new_packet(&ast->pkt, ast->audio_framesize * sub_packet_h);
} else if ((!strcmp(buf, "cook")) || (!strcmp(buf, "atrc")) || (!strcmp(buf, "sipr"))) {
- int codecdata_length, i;
+ int codecdata_length;
get_be16(pb); get_byte(pb);
if (((version >> 16) & 0xff) == 5)
get_byte(pb);
@@ -148,19 +177,18 @@ static int rm_read_audio_stream_info(AVFormatContext *s, AVStream *st,
else st->codec->codec_id = CODEC_ID_ATRAC3;
st->codec->extradata_size= codecdata_length;
st->codec->extradata= av_mallocz(st->codec->extradata_size + FF_INPUT_BUFFER_PADDING_SIZE);
- for(i = 0; i < codecdata_length; i++)
- ((uint8_t*)st->codec->extradata)[i] = get_byte(pb);
- rm->audio_framesize = st->codec->block_align;
- st->codec->block_align = rm->sub_packet_size;
+ get_buffer(pb, st->codec->extradata, st->codec->extradata_size);
+ ast->audio_framesize = st->codec->block_align;
+ st->codec->block_align = ast->sub_packet_size;
- if(rm->audio_framesize >= UINT_MAX / sub_packet_h){
+ if(ast->audio_framesize >= UINT_MAX / sub_packet_h){
av_log(s, AV_LOG_ERROR, "rm->audio_framesize * sub_packet_h too large\n");
return -1;
}
- rm->audiobuf = av_malloc(rm->audio_framesize * sub_packet_h);
+ av_new_packet(&ast->pkt, ast->audio_framesize * sub_packet_h);
} else if (!strcmp(buf, "raac") || !strcmp(buf, "racp")) {
- int codecdata_length, i;
+ int codecdata_length;
get_be16(pb); get_byte(pb);
if (((version >> 16) & 0xff) == 5)
get_byte(pb);
@@ -174,8 +202,7 @@ static int rm_read_audio_stream_info(AVFormatContext *s, AVStream *st,
st->codec->extradata_size = codecdata_length - 1;
st->codec->extradata = av_mallocz(st->codec->extradata_size + FF_INPUT_BUFFER_PADDING_SIZE);
get_byte(pb);
- for(i = 0; i < st->codec->extradata_size; i++)
- ((uint8_t*)st->codec->extradata)[i] = get_byte(pb);
+ get_buffer(pb, st->codec->extradata, st->codec->extradata_size);
}
} else {
st->codec->codec_id = CODEC_ID_NONE;
@@ -196,9 +223,9 @@ static int rm_read_audio_stream_info(AVFormatContext *s, AVStream *st,
}
int
-ff_rm_read_mdpr_codecdata (AVFormatContext *s, AVStream *st, int codec_data_size)
+ff_rm_read_mdpr_codecdata (AVFormatContext *s, ByteIOContext *pb,
+ AVStream *st, RMStream *rst, int codec_data_size)
{
- ByteIOContext *pb = s->pb;
unsigned int v;
int size;
int64_t codec_pos;
@@ -208,7 +235,7 @@ ff_rm_read_mdpr_codecdata (AVFormatContext *s, AVStream *st, int codec_data_size
v = get_be32(pb);
if (v == MKTAG(0xfd, 'a', 'r', '.')) {
/* ra type header */
- if (rm_read_audio_stream_info(s, st, 0))
+ if (rm_read_audio_stream_info(s, pb, st, rst, 0))
return -1;
} else {
int fps, fps2;
@@ -268,19 +295,20 @@ skip:
static int rm_read_header_old(AVFormatContext *s, AVFormatParameters *ap)
{
- RMContext *rm = s->priv_data;
+ RMDemuxContext *rm = s->priv_data;
AVStream *st;
rm->old_format = 1;
st = av_new_stream(s, 0);
if (!st)
return -1;
- return rm_read_audio_stream_info(s, st, 1);
+ st->priv_data = ff_rm_alloc_rmstream();
+ return rm_read_audio_stream_info(s, s->pb, st, st->priv_data, 1);
}
static int rm_read_header(AVFormatContext *s, AVFormatParameters *ap)
{
- RMContext *rm = s->priv_data;
+ RMDemuxContext *rm = s->priv_data;
AVStream *st;
ByteIOContext *pb = s->pb;
unsigned int tag;
@@ -357,7 +385,9 @@ static int rm_read_header(AVFormatContext *s, AVFormatParameters *ap)
get_str8(pb, buf, sizeof(buf)); /* desc */
get_str8(pb, buf, sizeof(buf)); /* mimetype */
st->codec->codec_type = CODEC_TYPE_DATA;
- if (ff_rm_read_mdpr_codecdata(s, st, get_be32(pb)) < 0)
+ st->priv_data = ff_rm_alloc_rmstream();
+ if (ff_rm_read_mdpr_codecdata(s, s->pb, st, st->priv_data,
+ get_be32(pb)) < 0)
return -1;
break;
case MKTAG('D', 'A', 'T', 'A'):
@@ -373,7 +403,6 @@ static int rm_read_header(AVFormatContext *s, AVFormatParameters *ap)
if (!rm->nb_packets && (flags & 4))
rm->nb_packets = 3600 * 25;
get_be32(pb); /* next data header */
- rm->curpic_num = -1;
return 0;
}
@@ -397,7 +426,7 @@ static int get_num(ByteIOContext *pb, int *len)
#define RAW_PACKET_SIZE 1000
static int sync(AVFormatContext *s, int64_t *timestamp, int *flags, int *stream_index, int64_t *pos){
- RMContext *rm = s->priv_data;
+ RMDemuxContext *rm = s->priv_data;
ByteIOContext *pb = s->pb;
int len, num, res, i;
AVStream *st;
@@ -452,83 +481,80 @@ skip:
return -1;
}
-static int rm_assemble_video_frame(AVFormatContext *s, RMContext *rm, AVPacket *pkt, int len)
+static int rm_assemble_video_frame(AVFormatContext *s, ByteIOContext *pb,
+ RMDemuxContext *rm, RMStream *vst,
+ AVPacket *pkt, int len)
{
- ByteIOContext *pb = s->pb;
int hdr, seq, pic_num, len2, pos;
int type;
hdr = get_byte(pb); len--;
type = hdr >> 6;
- switch(type){
- case 0: // slice
- case 2: // last slice
+
+ if(type != 3){ // not frame as a part of packet
seq = get_byte(pb); len--;
+ }
+ if(type != 1){ // not whole frame
len2 = get_num(pb, &len);
- pos = get_num(pb, &len);
+ pos = get_num(pb, &len);
pic_num = get_byte(pb); len--;
- rm->remaining_len = len;
- break;
- case 1: //whole frame
- seq = get_byte(pb); len--;
+ }
+ if(len<0)
+ return -1;
+ rm->remaining_len = len;
+ if(type&1){ // frame, not slice
+ if(type == 3) // frame as a part of packet
+ len= len2;
+ if(rm->remaining_len < len)
+ return -1;
+ rm->remaining_len -= len;
if(av_new_packet(pkt, len + 9) < 0)
return AVERROR(EIO);
pkt->data[0] = 0;
AV_WL32(pkt->data + 1, 1);
AV_WL32(pkt->data + 5, 0);
get_buffer(pb, pkt->data + 9, len);
- rm->remaining_len = 0;
- return 0;
- case 3: //frame as a part of packet
- len2 = get_num(pb, &len);
- pos = get_num(pb, &len);
- pic_num = get_byte(pb); len--;
- rm->remaining_len = len - len2;
- if(av_new_packet(pkt, len2 + 9) < 0)
- return AVERROR(EIO);
- pkt->data[0] = 0;
- AV_WL32(pkt->data + 1, 1);
- AV_WL32(pkt->data + 5, 0);
- get_buffer(pb, pkt->data + 9, len2);
return 0;
}
//now we have to deal with single slice
- if((seq & 0x7F) == 1 || rm->curpic_num != pic_num){
- rm->slices = ((hdr & 0x3F) << 1) + 1;
- rm->videobufsize = len2 + 8*rm->slices + 1;
- av_free(rm->videobuf);
- if(!(rm->videobuf = av_malloc(rm->videobufsize)))
+ if((seq & 0x7F) == 1 || vst->curpic_num != pic_num){
+ vst->slices = ((hdr & 0x3F) << 1) + 1;
+ vst->videobufsize = len2 + 8*vst->slices + 1;
+ av_free_packet(&vst->pkt); //FIXME this should be output.
+ if(av_new_packet(&vst->pkt, vst->videobufsize) < 0)
return AVERROR(ENOMEM);
- rm->videobufpos = 8*rm->slices + 1;
- rm->cur_slice = 0;
- rm->curpic_num = pic_num;
- rm->pktpos = url_ftell(pb);
+ vst->videobufpos = 8*vst->slices + 1;
+ vst->cur_slice = 0;
+ vst->curpic_num = pic_num;
+ vst->pktpos = url_ftell(pb);
}
if(type == 2)
len = FFMIN(len, pos);
- if(++rm->cur_slice > rm->slices)
+ if(++vst->cur_slice > vst->slices)
return 1;
- AV_WL32(rm->videobuf - 7 + 8*rm->cur_slice, 1);
- AV_WL32(rm->videobuf - 3 + 8*rm->cur_slice, rm->videobufpos - 8*rm->slices - 1);
- if(rm->videobufpos + len > rm->videobufsize)
+ AV_WL32(vst->pkt.data - 7 + 8*vst->cur_slice, 1);
+ AV_WL32(vst->pkt.data - 3 + 8*vst->cur_slice, vst->videobufpos - 8*vst->slices - 1);
+ if(vst->videobufpos + len > vst->videobufsize)
return 1;
- if (get_buffer(pb, rm->videobuf + rm->videobufpos, len) != len)
+ if (get_buffer(pb, vst->pkt.data + vst->videobufpos, len) != len)
return AVERROR(EIO);
- rm->videobufpos += len;
+ vst->videobufpos += len;
rm->remaining_len-= len;
- if(type == 2 || (rm->videobufpos) == rm->videobufsize){
- rm->videobuf[0] = rm->cur_slice-1;
- if(av_new_packet(pkt, rm->videobufpos - 8*(rm->slices - rm->cur_slice)) < 0)
- return AVERROR(ENOMEM);
- memcpy(pkt->data, rm->videobuf, 1 + 8*rm->cur_slice);
- memcpy(pkt->data + 1 + 8*rm->cur_slice, rm->videobuf + 1 + 8*rm->slices,
- rm->videobufpos - 1 - 8*rm->slices);
- pkt->pts = AV_NOPTS_VALUE;
- pkt->pos = rm->pktpos;
- return 0;
+ if(type == 2 || (vst->videobufpos) == vst->videobufsize){
+ vst->pkt.data[0] = vst->cur_slice-1;
+ *pkt= vst->pkt;
+ vst->pkt.data= NULL;
+ vst->pkt.size= 0;
+ if(vst->slices != vst->cur_slice) //FIXME find out how to set slices correct from the begin
+ memmove(pkt->data + 1 + 8*vst->cur_slice, pkt->data + 1 + 8*vst->slices,
+ vst->videobufpos - 1 - 8*vst->slices);
+ pkt->size = vst->videobufpos + 8*(vst->cur_slice - vst->slices);
+ pkt->pts = AV_NOPTS_VALUE;
+ pkt->pos = vst->pktpos;
+ return 0;
}
return 1;
@@ -550,15 +576,15 @@ rm_ac3_swap_bytes (AVStream *st, AVPacket *pkt)
}
int
-ff_rm_parse_packet (AVFormatContext *s, AVStream *st, int len, AVPacket *pkt,
+ff_rm_parse_packet (AVFormatContext *s, ByteIOContext *pb,
+ AVStream *st, RMStream *ast, int len, AVPacket *pkt,
int *seq, int *flags, int64_t *timestamp)
{
- ByteIOContext *pb = s->pb;
- RMContext *rm = s->priv_data;
+ RMDemuxContext *rm = s->priv_data;
if (st->codec->codec_type == CODEC_TYPE_VIDEO) {
rm->current_stream= st->id;
- if(rm_assemble_video_frame(s, rm, pkt, len) == 1)
+ if(rm_assemble_video_frame(s, pb, rm, ast, pkt, len))
return -1; //got partial frame
} else if (st->codec->codec_type == CODEC_TYPE_AUDIO) {
if ((st->codec->codec_id == CODEC_ID_RA_288) ||
@@ -566,51 +592,51 @@ ff_rm_parse_packet (AVFormatContext *s, AVStream *st, int len, AVPacket *pkt,
(st->codec->codec_id == CODEC_ID_ATRAC3) ||
(st->codec->codec_id == CODEC_ID_SIPR)) {
int x;
- int sps = rm->sub_packet_size;
- int cfs = rm->coded_framesize;
- int h = rm->sub_packet_h;
- int y = rm->sub_packet_cnt;
- int w = rm->audio_framesize;
+ int sps = ast->sub_packet_size;
+ int cfs = ast->coded_framesize;
+ int h = ast->sub_packet_h;
+ int y = ast->sub_packet_cnt;
+ int w = ast->audio_framesize;
if (*flags & 2)
- y = rm->sub_packet_cnt = 0;
+ y = ast->sub_packet_cnt = 0;
if (!y)
- rm->audiotimestamp = *timestamp;
+ ast->audiotimestamp = *timestamp;
switch(st->codec->codec_id) {
case CODEC_ID_RA_288:
for (x = 0; x < h/2; x++)
- get_buffer(pb, rm->audiobuf+x*2*w+y*cfs, cfs);
+ get_buffer(pb, ast->pkt.data+x*2*w+y*cfs, cfs);
break;
case CODEC_ID_ATRAC3:
case CODEC_ID_COOK:
for (x = 0; x < w/sps; x++)
- get_buffer(pb, rm->audiobuf+sps*(h*x+((h+1)/2)*(y&1)+(y>>1)), sps);
+ get_buffer(pb, ast->pkt.data+sps*(h*x+((h+1)/2)*(y&1)+(y>>1)), sps);
break;
}
- if (++(rm->sub_packet_cnt) < h)
+ if (++(ast->sub_packet_cnt) < h)
return -1;
else {
- rm->sub_packet_cnt = 0;
+ ast->sub_packet_cnt = 0;
rm->audio_stream_num = st->index;
rm->audio_pkt_cnt = h * w / st->codec->block_align - 1;
// Release first audio packet
av_new_packet(pkt, st->codec->block_align);
- memcpy(pkt->data, rm->audiobuf, st->codec->block_align);
- *timestamp = rm->audiotimestamp;
+ memcpy(pkt->data, ast->pkt.data, st->codec->block_align); //FIXME avoid this
+ *timestamp = ast->audiotimestamp;
*flags = 2; // Mark first packet as keyframe
}
} else if (st->codec->codec_id == CODEC_ID_AAC) {
int x;
rm->audio_stream_num = st->index;
- rm->sub_packet_cnt = (get_be16(pb) & 0xf0) >> 4;
- if (rm->sub_packet_cnt) {
- for (x = 0; x < rm->sub_packet_cnt; x++)
- rm->sub_packet_lengths[x] = get_be16(pb);
+ ast->sub_packet_cnt = (get_be16(pb) & 0xf0) >> 4;
+ if (ast->sub_packet_cnt) {
+ for (x = 0; x < ast->sub_packet_cnt; x++)
+ ast->sub_packet_lengths[x] = get_be16(pb);
// Release first audio packet
- rm->audio_pkt_cnt = rm->sub_packet_cnt - 1;
- av_get_packet(pb, pkt, rm->sub_packet_lengths[0]);
+ rm->audio_pkt_cnt = ast->sub_packet_cnt - 1;
+ av_get_packet(pb, pkt, ast->sub_packet_lengths[0]);
*flags = 2; // Mark first packet as keyframe
}
} else {
@@ -645,33 +671,35 @@ ff_rm_parse_packet (AVFormatContext *s, AVStream *st, int len, AVPacket *pkt,
if (*flags & 2)
pkt->flags |= PKT_FLAG_KEY;
- return 0;
+ return st->codec->codec_type == CODEC_TYPE_AUDIO ? rm->audio_pkt_cnt : 0;
}
-void
-ff_rm_retrieve_cache (AVFormatContext *s, AVStream *st, AVPacket *pkt)
+int
+ff_rm_retrieve_cache (AVFormatContext *s, ByteIOContext *pb,
+ AVStream *st, RMStream *ast, AVPacket *pkt)
{
- ByteIOContext *pb = s->pb;
- RMContext *rm = s->priv_data;
+ RMDemuxContext *rm = s->priv_data;
assert (rm->audio_pkt_cnt > 0);
if (st->codec->codec_id == CODEC_ID_AAC)
- av_get_packet(pb, pkt, rm->sub_packet_lengths[rm->sub_packet_cnt - rm->audio_pkt_cnt]);
+ av_get_packet(pb, pkt, ast->sub_packet_lengths[ast->sub_packet_cnt - rm->audio_pkt_cnt]);
else {
av_new_packet(pkt, st->codec->block_align);
- memcpy(pkt->data, rm->audiobuf + st->codec->block_align *
- (rm->sub_packet_h * rm->audio_framesize / st->codec->block_align - rm->audio_pkt_cnt),
+ memcpy(pkt->data, ast->pkt.data + st->codec->block_align * //FIXME avoid this
+ (ast->sub_packet_h * ast->audio_framesize / st->codec->block_align - rm->audio_pkt_cnt),
st->codec->block_align);
}
rm->audio_pkt_cnt--;
pkt->flags = 0;
pkt->stream_index = st->index;
+
+ return rm->audio_pkt_cnt;
}
static int rm_read_packet(AVFormatContext *s, AVPacket *pkt)
{
- RMContext *rm = s->priv_data;
+ RMDemuxContext *rm = s->priv_data;
ByteIOContext *pb = s->pb;
AVStream *st;
int i, len;
@@ -681,21 +709,24 @@ static int rm_read_packet(AVFormatContext *s, AVPacket *pkt)
if (rm->audio_pkt_cnt) {
// If there are queued audio packet return them first
st = s->streams[rm->audio_stream_num];
- ff_rm_retrieve_cache(s, st, pkt);
+ ff_rm_retrieve_cache(s, s->pb, st, st->priv_data, pkt);
} else if (rm->old_format) {
+ RMStream *ast;
+
st = s->streams[0];
+ ast = st->priv_data;
if (st->codec->codec_id == CODEC_ID_RA_288) {
int x, y;
- for (y = 0; y < rm->sub_packet_h; y++)
- for (x = 0; x < rm->sub_packet_h/2; x++)
- if (get_buffer(pb, rm->audiobuf+x*2*rm->audio_framesize+y*rm->coded_framesize, rm->coded_framesize) <= 0)
+ for (y = 0; y < ast->sub_packet_h; y++)
+ for (x = 0; x < ast->sub_packet_h/2; x++)
+ if (get_buffer(pb, ast->pkt.data+x*2*ast->audio_framesize+y*ast->coded_framesize, ast->coded_framesize) <= 0)
return AVERROR(EIO);
rm->audio_stream_num = 0;
- rm->audio_pkt_cnt = rm->sub_packet_h * rm->audio_framesize / st->codec->block_align - 1;
+ rm->audio_pkt_cnt = ast->sub_packet_h * ast->audio_framesize / st->codec->block_align - 1;
// Release first audio packet
av_new_packet(pkt, st->codec->block_align);
- memcpy(pkt->data, rm->audiobuf, st->codec->block_align);
+ memcpy(pkt->data, ast->pkt.data, st->codec->block_align); //FIXME avoid this
pkt->flags |= PKT_FLAG_KEY; // Mark first packet as keyframe
pkt->stream_index = 0;
} else {
@@ -717,7 +748,8 @@ resync:
return AVERROR(EIO);
st = s->streams[i];
- if (ff_rm_parse_packet (s, st, len, pkt, &seq, &flags, ×tamp) < 0)
+ if (ff_rm_parse_packet (s, s->pb, st, st->priv_data, len, pkt,
+ &seq, &flags, ×tamp) < 0)
goto resync;
if((flags&2) && (seq&0x7F) == 1)
@@ -729,10 +761,11 @@ resync:
static int rm_read_close(AVFormatContext *s)
{
- RMContext *rm = s->priv_data;
+ int i;
+
+ for (i=0;i<s->nb_streams;i++)
+ ff_rm_free_rmstream(s->streams[i]->priv_data);
- av_free(rm->audiobuf);
- av_free(rm->videobuf);
return 0;
}
@@ -752,7 +785,7 @@ static int rm_probe(AVProbeData *p)
static int64_t rm_read_dts(AVFormatContext *s, int stream_index,
int64_t *ppos, int64_t pos_limit)
{
- RMContext *rm = s->priv_data;
+ RMDemuxContext *rm = s->priv_data;
int64_t pos, dts;
int stream_index2, flags, len, h;
@@ -795,7 +828,7 @@ static int64_t rm_read_dts(AVFormatContext *s, int stream_index,
AVInputFormat rm_demuxer = {
"rm",
NULL_IF_CONFIG_SMALL("RM format"),
- sizeof(RMContext),
+ sizeof(RMDemuxContext),
rm_probe,
rm_read_header,
rm_read_packet,
@@ -807,6 +840,6 @@ AVInputFormat rm_demuxer = {
AVInputFormat rdt_demuxer = {
"rdt",
NULL_IF_CONFIG_SMALL("RDT demuxer"),
- sizeof(RMContext),
+ sizeof(RMDemuxContext),
NULL, NULL, NULL, rm_read_close, NULL, NULL
};
diff --git a/libavformat/rmenc.c b/libavformat/rmenc.c
index 0e61f02..84a265a 100644
--- a/libavformat/rmenc.c
+++ b/libavformat/rmenc.c
@@ -21,6 +21,25 @@
#include "avformat.h"
#include "rm.h"
+typedef struct {
+ int nb_packets;
+ int packet_total_size;
+ int packet_max_size;
+ /* codec related output */
+ int bit_rate;
+ float frame_rate;
+ int nb_frames; /* current frame number */
+ int total_frames; /* total number of frames */
+ int num;
+ AVCodecContext *enc;
+} StreamInfo;
+
+typedef struct {
+ StreamInfo streams[2];
+ StreamInfo *audio_stream, *video_stream;
+ int data_pos; /* position of the data after the header */
+} RMMuxContext;
+
/* in ms */
#define BUFFER_DURATION 0
@@ -44,7 +63,7 @@ static void put_str8(ByteIOContext *s, const char *tag)
static void rv10_write_header(AVFormatContext *ctx,
int data_size, int index_pos)
{
- RMContext *rm = ctx->priv_data;
+ RMMuxContext *rm = ctx->priv_data;
ByteIOContext *s = ctx->pb;
StreamInfo *stream;
unsigned char *data_offset_ptr, *start_ptr;
@@ -271,7 +290,7 @@ static void write_packet_header(AVFormatContext *ctx, StreamInfo *stream,
static int rm_write_header(AVFormatContext *s)
{
- RMContext *rm = s->priv_data;
+ RMMuxContext *rm = s->priv_data;
StreamInfo *stream;
int n;
AVCodecContext *codec;
@@ -315,7 +334,7 @@ static int rm_write_header(AVFormatContext *s)
static int rm_write_audio(AVFormatContext *s, const uint8_t *buf, int size, int flags)
{
uint8_t *buf1;
- RMContext *rm = s->priv_data;
+ RMMuxContext *rm = s->priv_data;
ByteIOContext *pb = s->pb;
StreamInfo *stream = rm->audio_stream;
int i;
@@ -339,7 +358,7 @@ static int rm_write_audio(AVFormatContext *s, const uint8_t *buf, int size, int
static int rm_write_video(AVFormatContext *s, const uint8_t *buf, int size, int flags)
{
- RMContext *rm = s->priv_data;
+ RMMuxContext *rm = s->priv_data;
ByteIOContext *pb = s->pb;
StreamInfo *stream = rm->video_stream;
int key_frame = !!(flags & PKT_FLAG_KEY);
@@ -393,7 +412,7 @@ static int rm_write_packet(AVFormatContext *s, AVPacket *pkt)
static int rm_write_trailer(AVFormatContext *s)
{
- RMContext *rm = s->priv_data;
+ RMMuxContext *rm = s->priv_data;
int data_size, index_pos, i;
ByteIOContext *pb = s->pb;
@@ -435,7 +454,7 @@ AVOutputFormat rm_muxer = {
NULL_IF_CONFIG_SMALL("RM format"),
"application/vnd.rn-realmedia",
"rm,ra",
- sizeof(RMContext),
+ sizeof(RMMuxContext),
CODEC_ID_AC3,
CODEC_ID_RV10,
rm_write_header,
diff --git a/libavformat/rtp.h b/libavformat/rtp.h
index 4de5919..7819ceb 100644
--- a/libavformat/rtp.h
+++ b/libavformat/rtp.h
@@ -24,6 +24,33 @@
#include "libavcodec/avcodec.h"
#include "avformat.h"
+/** Structure listing useful vars to parse RTP packet payload*/
+typedef struct rtp_payload_data
+{
+ int sizelength;
+ int indexlength;
+ int indexdeltalength;
+ int profile_level_id;
+ int streamtype;
+ int objecttype;
+ char *mode;
+
+ /** mpeg 4 AU headers */
+ struct AUHeaders {
+ int size;
+ int index;
+ int cts_flag;
+ int cts;
+ int dts_flag;
+ int dts;
+ int rap_flag;
+ int streamstate;
+ } *au_headers;
+ int nb_au_headers;
+ int au_headers_length_bytes;
+ int cur_au_index;
+} RTPPayloadData;
+
typedef struct PayloadContext PayloadContext;
typedef struct RTPDynamicProtocolHandler_s RTPDynamicProtocolHandler;
@@ -36,8 +63,7 @@ int rtp_get_codec_info(AVCodecContext *codec, int payload_type);
int rtp_get_payload_type(AVCodecContext *codec);
typedef struct RTPDemuxContext RTPDemuxContext;
-typedef struct rtp_payload_data_s rtp_payload_data_s;
-RTPDemuxContext *rtp_parse_open(AVFormatContext *s1, AVStream *st, URLContext *rtpc, int payload_type, rtp_payload_data_s *rtp_payload_data);
+RTPDemuxContext *rtp_parse_open(AVFormatContext *s1, AVStream *st, URLContext *rtpc, int payload_type, RTPPayloadData *rtp_payload_data);
void rtp_parse_set_dynamic_protocol(RTPDemuxContext *s, PayloadContext *ctx,
RTPDynamicProtocolHandler *handler);
int rtp_parse_packet(RTPDemuxContext *s, AVPacket *pkt,
@@ -63,56 +89,4 @@ int rtp_check_and_send_back_rr(RTPDemuxContext *s, int count);
#define RTCP_TX_RATIO_NUM 5
#define RTCP_TX_RATIO_DEN 1000
-/** Structure listing useful vars to parse RTP packet payload*/
-typedef struct rtp_payload_data_s
-{
- int sizelength;
- int indexlength;
- int indexdeltalength;
- int profile_level_id;
- int streamtype;
- int objecttype;
- char *mode;
-
- /** mpeg 4 AU headers */
- struct AUHeaders {
- int size;
- int index;
- int cts_flag;
- int cts;
- int dts_flag;
- int dts;
- int rap_flag;
- int streamstate;
- } *au_headers;
- int nb_au_headers;
- int au_headers_length_bytes;
- int cur_au_index;
-} rtp_payload_data_t;
-
-#if 0
-typedef enum {
- RTCP_SR = 200,
- RTCP_RR = 201,
- RTCP_SDES = 202,
- RTCP_BYE = 203,
- RTCP_APP = 204
-} rtcp_type_t;
-
-typedef enum {
- RTCP_SDES_END = 0,
- RTCP_SDES_CNAME = 1,
- RTCP_SDES_NAME = 2,
- RTCP_SDES_EMAIL = 3,
- RTCP_SDES_PHONE = 4,
- RTCP_SDES_LOC = 5,
- RTCP_SDES_TOOL = 6,
- RTCP_SDES_NOTE = 7,
- RTCP_SDES_PRIV = 8,
- RTCP_SDES_IMG = 9,
- RTCP_SDES_DOOR = 10,
- RTCP_SDES_SOURCE = 11
-} rtcp_sdes_type_t;
-#endif
-
#endif /* AVFORMAT_RTP_H */
diff --git a/libavformat/rtp_h264.c b/libavformat/rtp_h264.c
index bc29852..7d6f96c 100644
--- a/libavformat/rtp_h264.c
+++ b/libavformat/rtp_h264.c
@@ -348,9 +348,10 @@ static void h264_free_extradata(PayloadContext *data)
av_free(data);
}
-static int parse_h264_sdp_line(AVStream * stream, PayloadContext *h264_data,
- const char *line)
+static int parse_h264_sdp_line(AVFormatContext *s, int st_index,
+ PayloadContext *h264_data, const char *line)
{
+ AVStream *stream = s->streams[st_index];
AVCodecContext *codec = stream->codec;
const char *p = line;
diff --git a/libavformat/rtp_internal.h b/libavformat/rtp_internal.h
index 80c81cb..6b7588f 100644
--- a/libavformat/rtp_internal.h
+++ b/libavformat/rtp_internal.h
@@ -66,7 +66,8 @@ struct RTPDynamicProtocolHandler_s {
enum CodecID codec_id;
// may be null
- int (*parse_sdp_a_line) (AVStream * stream,
+ int (*parse_sdp_a_line) (AVFormatContext *s,
+ int st_index,
PayloadContext *priv_data,
const char *line); ///< Parse the a= line from the sdp field
PayloadContext *(*open) (); ///< allocate any data needed by the rtp parsing for this dynamic data.
@@ -111,7 +112,7 @@ struct RTPDemuxContext {
uint8_t *buf_ptr;
/* special infos for au headers parsing */
- rtp_payload_data_t *rtp_payload_data; // TODO: Move into dynamic payload handlers
+ RTPPayloadData *rtp_payload_data; // TODO: Move into dynamic payload handlers
/* dynamic payload stuff */
DynamicPayloadPacketHandlerProc parse_packet; ///< This is also copied from the dynamic protocol handler structure
diff --git a/libavformat/rtpdec.c b/libavformat/rtpdec.c
index b08509c..7d8fc2c 100644
--- a/libavformat/rtpdec.c
+++ b/libavformat/rtpdec.c
@@ -267,7 +267,7 @@ int rtp_check_and_send_back_rr(RTPDemuxContext *s, int count)
* rtp demux (otherwise CODEC_ID_MPEG2TS packets are returned)
* TODO: change this to not take rtp_payload data, and use the new dynamic payload system.
*/
-RTPDemuxContext *rtp_parse_open(AVFormatContext *s1, AVStream *st, URLContext *rtpc, int payload_type, rtp_payload_data_t *rtp_payload_data)
+RTPDemuxContext *rtp_parse_open(AVFormatContext *s1, AVStream *st, URLContext *rtpc, int payload_type, RTPPayloadData *rtp_payload_data)
{
RTPDemuxContext *s;
@@ -323,7 +323,7 @@ static int rtp_parse_mp4_au(RTPDemuxContext *s, const uint8_t *buf)
{
int au_headers_length, au_header_size, i;
GetBitContext getbitcontext;
- rtp_payload_data_t *infos;
+ RTPPayloadData *infos;
infos = s->rtp_payload_data;
@@ -512,7 +512,7 @@ int rtp_parse_packet(RTPDemuxContext *s, AVPacket *pkt,
if (rtp_parse_mp4_au(s, buf))
return -1;
{
- rtp_payload_data_t *infos = s->rtp_payload_data;
+ RTPPayloadData *infos = s->rtp_payload_data;
if (infos == NULL)
return -1;
buf += infos->au_headers_length_bytes + 2;
diff --git a/libavformat/rtsp.c b/libavformat/rtsp.c
index f7596d2..464cffe 100644
--- a/libavformat/rtsp.c
+++ b/libavformat/rtsp.c
@@ -39,62 +39,6 @@
//#define DEBUG
//#define DEBUG_RTP_TCP
-enum RTSPClientState {
- RTSP_STATE_IDLE,
- RTSP_STATE_PLAYING,
- RTSP_STATE_PAUSED,
-};
-
-enum RTSPServerType {
- RTSP_SERVER_RTP, /*< Standard-compliant RTP-server */
- RTSP_SERVER_REAL, /*< Realmedia-style server */
- RTSP_SERVER_LAST
-};
-
-enum RTSPTransport {
- RTSP_TRANSPORT_RTP,
- RTSP_TRANSPORT_RDT,
- RTSP_TRANSPORT_LAST
-};
-
-typedef struct RTSPState {
- URLContext *rtsp_hd; /* RTSP TCP connexion handle */
- int nb_rtsp_streams;
- struct RTSPStream **rtsp_streams;
-
- enum RTSPClientState state;
- int64_t seek_timestamp;
-
- /* XXX: currently we use unbuffered input */
- // ByteIOContext rtsp_gb;
- int seq; /* RTSP command sequence number */
- char session_id[512];
- enum RTSPTransport transport;
- enum RTSPLowerTransport lower_transport;
- enum RTSPServerType server_type;
- char last_reply[2048]; /* XXX: allocate ? */
- void *cur_tx;
- int need_subscription;
-} RTSPState;
-
-typedef struct RTSPStream {
- URLContext *rtp_handle; /* RTP stream handle */
- void *tx_ctx; /* RTP/RDT parse context */
-
- int stream_index; /* corresponding stream index, if any. -1 if none (MPEG2TS case) */
- int interleaved_min, interleaved_max; /* interleave ids, if TCP transport */
- char control_url[1024]; /* url for this stream (from SDP) */
-
- int sdp_port; /* port (from SDP content - not used in RTSP) */
- struct in_addr sdp_ip; /* IP address (from SDP content - not used in RTSP) */
- int sdp_ttl; /* IP TTL (from SDP content - not used in RTSP) */
- int sdp_payload_type; /* payload type - only used in SDP */
- rtp_payload_data_t rtp_payload_data; /* rtp payload parsing infos from SDP */
-
- RTPDynamicProtocolHandler *dynamic_handler; ///< Only valid if it's a dynamic protocol. (This is the handler structure)
- PayloadContext *dynamic_protocol_context; ///< Only valid if it's a dynamic protocol. (This is any private data associated with the dynamic protocol)
-} RTSPStream;
-
static int rtsp_read_play(AVFormatContext *s);
/* XXX: currently, the only way to change the protocols consists in
@@ -283,24 +227,23 @@ static void sdp_parse_fmtp_config(AVCodecContext *codec, char *attr, char *value
return;
}
-typedef struct attrname_map
-{
+typedef struct {
const char *str;
uint16_t type;
uint32_t offset;
-} attrname_map_t;
+} AttrNameMap;
/* All known fmtp parmeters and the corresping RTPAttrTypeEnum */
#define ATTR_NAME_TYPE_INT 0
#define ATTR_NAME_TYPE_STR 1
-static const attrname_map_t attr_names[]=
+static const AttrNameMap attr_names[]=
{
- {"SizeLength", ATTR_NAME_TYPE_INT, offsetof(rtp_payload_data_t, sizelength)},
- {"IndexLength", ATTR_NAME_TYPE_INT, offsetof(rtp_payload_data_t, indexlength)},
- {"IndexDeltaLength", ATTR_NAME_TYPE_INT, offsetof(rtp_payload_data_t, indexdeltalength)},
- {"profile-level-id", ATTR_NAME_TYPE_INT, offsetof(rtp_payload_data_t, profile_level_id)},
- {"StreamType", ATTR_NAME_TYPE_INT, offsetof(rtp_payload_data_t, streamtype)},
- {"mode", ATTR_NAME_TYPE_STR, offsetof(rtp_payload_data_t, mode)},
+ {"SizeLength", ATTR_NAME_TYPE_INT, offsetof(RTPPayloadData, sizelength)},
+ {"IndexLength", ATTR_NAME_TYPE_INT, offsetof(RTPPayloadData, indexlength)},
+ {"IndexDeltaLength", ATTR_NAME_TYPE_INT, offsetof(RTPPayloadData, indexdeltalength)},
+ {"profile-level-id", ATTR_NAME_TYPE_INT, offsetof(RTPPayloadData, profile_level_id)},
+ {"StreamType", ATTR_NAME_TYPE_INT, offsetof(RTPPayloadData, streamtype)},
+ {"mode", ATTR_NAME_TYPE_STR, offsetof(RTPPayloadData, mode)},
{NULL, -1, -1},
};
@@ -332,7 +275,7 @@ static void sdp_parse_fmtp(AVStream *st, const char *p)
RTSPStream *rtsp_st = st->priv_data;
AVCodecContext *codec = st->codec;
- rtp_payload_data_t *rtp_payload_data = &rtsp_st->rtp_payload_data;
+ RTPPayloadData *rtp_payload_data = &rtsp_st->rtp_payload_data;
/* loop on each attribute */
while(rtsp_next_attr_and_value(&p, attr, sizeof(attr), value, sizeof(value)))
@@ -518,7 +461,7 @@ static void sdp_parse_line(AVFormatContext *s, SDPParseState *s1,
rtsp_st = st->priv_data;
if (rtsp_st->sdp_payload_type == payload_type) {
if(rtsp_st->dynamic_handler && rtsp_st->dynamic_handler->parse_sdp_a_line) {
- if(!rtsp_st->dynamic_handler->parse_sdp_a_line(st, rtsp_st->dynamic_protocol_context, buf)) {
+ if(!rtsp_st->dynamic_handler->parse_sdp_a_line(s, i, rtsp_st->dynamic_protocol_context, buf)) {
sdp_parse_fmtp(st, p);
}
} else {
@@ -535,7 +478,7 @@ static void sdp_parse_line(AVFormatContext *s, SDPParseState *s1,
rtsp_st = st->priv_data;
if (rtsp_st->sdp_payload_type == payload_type) {
if(rtsp_st->dynamic_handler && rtsp_st->dynamic_handler->parse_sdp_a_line) {
- rtsp_st->dynamic_handler->parse_sdp_a_line(st, rtsp_st->dynamic_protocol_context, buf);
+ rtsp_st->dynamic_handler->parse_sdp_a_line(s, i, rtsp_st->dynamic_protocol_context, buf);
}
}
}
@@ -550,10 +493,13 @@ static void sdp_parse_line(AVFormatContext *s, SDPParseState *s1,
if (atoi(p) == 1)
rt->transport = RTSP_TRANSPORT_RDT;
} else if (s->nb_streams > 0) {
+ if (rt->server_type == RTSP_SERVER_REAL)
+ ff_real_parse_sdp_a_line(s, s->nb_streams - 1, p);
+
rtsp_st = s->streams[s->nb_streams - 1]->priv_data;
if (rtsp_st->dynamic_handler &&
rtsp_st->dynamic_handler->parse_sdp_a_line)
- rtsp_st->dynamic_handler->parse_sdp_a_line(s->streams[s->nb_streams - 1],
+ rtsp_st->dynamic_handler->parse_sdp_a_line(s, s->nb_streams - 1,
rtsp_st->dynamic_protocol_context, buf);
}
break;
@@ -564,7 +510,11 @@ static int sdp_parse(AVFormatContext *s, const char *content)
{
const char *p;
int letter;
- char buf[2048], *q;
+ /* Some SDP lines, particularly for Realmedia or ASF RTSP streams, contain long SDP
+ * lines containing complete ASF Headers (several kB) or arrays of MDPR (RM stream
+ * descriptor) headers plus "rulebooks" describing their properties. Therefore, the
+ * SDP line buffer is large. */
+ char buf[8192], *q;
SDPParseState sdp_parse_state, *s1 = &sdp_parse_state;
memset(s1, 0, sizeof(SDPParseState));
@@ -738,6 +688,9 @@ void rtsp_parse_line(RTSPHeader *reply, const char *buf)
} else if (av_stristart(p, "RealChallenge1:", &p)) {
skip_spaces(&p);
av_strlcpy(reply->real_challenge, p, sizeof(reply->real_challenge));
+ } else if (av_stristart(p, "Server:", &p)) {
+ skip_spaces(&p);
+ av_strlcpy(reply->server, p, sizeof(reply->server));
}
}
@@ -901,7 +854,7 @@ rtsp_open_transport_ctx(AVFormatContext *s, RTSPStream *rtsp_st)
s->ctx_flags |= AVFMTCTX_NOHEADER;
if (rt->transport == RTSP_TRANSPORT_RDT)
- rtsp_st->tx_ctx = ff_rdt_parse_open(s, st,
+ rtsp_st->tx_ctx = ff_rdt_parse_open(s, st->index,
rtsp_st->dynamic_protocol_context,
rtsp_st->dynamic_handler);
else
@@ -1173,6 +1126,8 @@ static int rtsp_read_header(AVFormatContext *s,
if (rt->server_type != RTSP_SERVER_REAL && reply->real_challenge[0]) {
rt->server_type = RTSP_SERVER_REAL;
continue;
+ } else if (!strncasecmp(reply->server, "WMServer/", 9)) {
+ rt->server_type = RTSP_SERVER_WMS;
} else if (rt->server_type == RTSP_SERVER_REAL) {
strcpy(real_challenge, reply->real_challenge);
}
@@ -1281,7 +1236,7 @@ static int tcp_read_packet(AVFormatContext *s, RTSPStream **prtsp_st,
if (ret != len)
return -1;
if (rt->transport == RTSP_TRANSPORT_RDT &&
- ff_rdt_parse_header(buf, len, &id, NULL, NULL, NULL) < 0)
+ ff_rdt_parse_header(buf, len, &id, NULL, NULL, NULL, NULL) < 0)
return -1;
/* find the matching stream */
@@ -1344,33 +1299,68 @@ static int rtsp_read_packet(AVFormatContext *s,
RTSPState *rt = s->priv_data;
RTSPStream *rtsp_st;
int ret, len;
- uint8_t buf[RTP_MAX_PACKET_LENGTH];
+ uint8_t buf[10 * RTP_MAX_PACKET_LENGTH];
- if (rt->server_type == RTSP_SERVER_REAL && rt->need_subscription) {
+ if (rt->server_type == RTSP_SERVER_REAL) {
int i;
RTSPHeader reply1, *reply = &reply1;
+ enum AVDiscard cache[MAX_STREAMS];
char cmd[1024];
- snprintf(cmd, sizeof(cmd),
- "SET_PARAMETER %s RTSP/1.0\r\n"
- "Subscribe: ",
- s->filename);
- for (i = 0; i < rt->nb_rtsp_streams; i++) {
- if (i != 0) av_strlcat(cmd, ",", sizeof(cmd));
- ff_rdt_subscribe_rule(cmd, sizeof(cmd), i, 0);
- if (rt->transport == RTSP_TRANSPORT_RDT)
- ff_rdt_subscribe_rule2(
- rt->rtsp_streams[i]->tx_ctx,
- cmd, sizeof(cmd), i, 0);
+ for (i = 0; i < s->nb_streams; i++)
+ cache[i] = s->streams[i]->discard;
+
+ if (!rt->need_subscription) {
+ if (memcmp (cache, rt->real_setup_cache,
+ sizeof(enum AVDiscard) * s->nb_streams)) {
+ av_strlcatf(cmd, sizeof(cmd),
+ "SET_PARAMETER %s RTSP/1.0\r\n"
+ "Unsubscribe: %s\r\n",
+ s->filename, rt->last_subscription);
+ rtsp_send_cmd(s, cmd, reply, NULL);
+ if (reply->status_code != RTSP_STATUS_OK)
+ return AVERROR_INVALIDDATA;
+ rt->need_subscription = 1;
+ }
}
- av_strlcat(cmd, "\r\n", sizeof(cmd));
- rtsp_send_cmd(s, cmd, reply, NULL);
- if (reply->status_code != RTSP_STATUS_OK)
- return AVERROR_INVALIDDATA;
- rt->need_subscription = 0;
- if (rt->state == RTSP_STATE_PLAYING)
- rtsp_read_play (s);
+ if (rt->need_subscription) {
+ int r, rule_nr, first = 1;
+
+ memcpy(rt->real_setup_cache, cache,
+ sizeof(enum AVDiscard) * s->nb_streams);
+ rt->last_subscription[0] = 0;
+
+ snprintf(cmd, sizeof(cmd),
+ "SET_PARAMETER %s RTSP/1.0\r\n"
+ "Subscribe: ",
+ s->filename);
+ for (i = 0; i < rt->nb_rtsp_streams; i++) {
+ rule_nr = 0;
+ for (r = 0; r < s->nb_streams; r++) {
+ if (s->streams[r]->priv_data == rt->rtsp_streams[i]) {
+ if (s->streams[r]->discard != AVDISCARD_ALL) {
+ if (!first)
+ av_strlcat(rt->last_subscription, ",",
+ sizeof(rt->last_subscription));
+ ff_rdt_subscribe_rule(
+ rt->last_subscription,
+ sizeof(rt->last_subscription), i, rule_nr);
+ first = 0;
+ }
+ rule_nr++;
+ }
+ }
+ }
+ av_strlcatf(cmd, sizeof(cmd), "%s\r\n", rt->last_subscription);
+ rtsp_send_cmd(s, cmd, reply, NULL);
+ if (reply->status_code != RTSP_STATUS_OK)
+ return AVERROR_INVALIDDATA;
+ rt->need_subscription = 0;
+
+ if (rt->state == RTSP_STATE_PLAYING)
+ rtsp_read_play (s);
+ }
}
/* get next frames from the same RTP packet */
@@ -1399,7 +1389,7 @@ static int rtsp_read_packet(AVFormatContext *s,
case RTSP_LOWER_TRANSPORT_UDP:
case RTSP_LOWER_TRANSPORT_UDP_MULTICAST:
len = udp_read_packet(s, &rtsp_st, buf, sizeof(buf));
- if (len >=0 && rtsp_st->tx_ctx)
+ if (len >=0 && rtsp_st->tx_ctx && rt->transport == RTSP_TRANSPORT_RTP)
rtp_check_and_send_back_rr(rtsp_st->tx_ctx, len);
break;
}
diff --git a/libavformat/rtsp.h b/libavformat/rtsp.h
index 611f5c3..ec3477b 100644
--- a/libavformat/rtsp.h
+++ b/libavformat/rtsp.h
@@ -24,6 +24,8 @@
#include <stdint.h>
#include "avformat.h"
#include "rtspcodes.h"
+#include "rtp.h"
+#include "network.h"
enum RTSPLowerTransport {
RTSP_LOWER_TRANSPORT_UDP = 0,
@@ -64,8 +66,68 @@ typedef struct RTSPHeader {
int seq; /**< sequence number */
char session_id[512];
char real_challenge[64]; /**< the RealChallenge1 field from the server */
+ char server[64];
} RTSPHeader;
+enum RTSPClientState {
+ RTSP_STATE_IDLE,
+ RTSP_STATE_PLAYING,
+ RTSP_STATE_PAUSED,
+};
+
+enum RTSPServerType {
+ RTSP_SERVER_RTP, /**< Standards-compliant RTP-server */
+ RTSP_SERVER_REAL, /**< Realmedia-style server */
+ RTSP_SERVER_WMS, /**< Windows Media server */
+ RTSP_SERVER_LAST
+};
+
+enum RTSPTransport {
+ RTSP_TRANSPORT_RTP,
+ RTSP_TRANSPORT_RDT,
+ RTSP_TRANSPORT_LAST
+};
+
+typedef struct RTSPState {
+ URLContext *rtsp_hd; /* RTSP TCP connexion handle */
+ int nb_rtsp_streams;
+ struct RTSPStream **rtsp_streams;
+
+ enum RTSPClientState state;
+ int64_t seek_timestamp;
+
+ /* XXX: currently we use unbuffered input */
+ // ByteIOContext rtsp_gb;
+ int seq; /* RTSP command sequence number */
+ char session_id[512];
+ enum RTSPTransport transport;
+ enum RTSPLowerTransport lower_transport;
+ enum RTSPServerType server_type;
+ char last_reply[2048]; /* XXX: allocate ? */
+ void *cur_tx;
+ int need_subscription;
+ enum AVDiscard real_setup_cache[MAX_STREAMS];
+ char last_subscription[1024];
+} RTSPState;
+
+typedef struct RTSPStream {
+ URLContext *rtp_handle; /* RTP stream handle */
+ void *tx_ctx; /* RTP/RDT parse context */
+
+ int stream_index; /* corresponding stream index, if any. -1 if none (MPEG2TS case) */
+ int interleaved_min, interleaved_max; /* interleave ids, if TCP transport */
+ char control_url[1024]; /* url for this stream (from SDP) */
+
+ int sdp_port; /* port (from SDP content - not used in RTSP) */
+ struct in_addr sdp_ip; /* IP address (from SDP content - not used in RTSP) */
+ int sdp_ttl; /* IP TTL (from SDP content - not used in RTSP) */
+ int sdp_payload_type; /* payload type - only used in SDP */
+ RTPPayloadData rtp_payload_data; /* rtp payload parsing infos from SDP */
+
+ RTPDynamicProtocolHandler *dynamic_handler; ///< Only valid if it's a dynamic protocol. (This is the handler structure)
+ PayloadContext *dynamic_protocol_context; ///< Only valid if it's a dynamic protocol. (This is any private data associated with the dynamic protocol)
+} RTSPStream;
+
/** the callback can be used to extend the connection setup/teardown step */
enum RTSPCallbackAction {
RTSP_ACTION_SERVER_SETUP,
diff --git a/libavformat/sdp.c b/libavformat/sdp.c
index dfbe52e..d22f516 100644
--- a/libavformat/sdp.c
+++ b/libavformat/sdp.c
@@ -43,7 +43,7 @@ struct sdp_session_level {
const char *name; /**< session name (can be an empty string) */
};
-static void dest_write(char *buff, int size, const char *dest_addr, int ttl)
+static void sdp_write_address(char *buff, int size, const char *dest_addr, int ttl)
{
if (dest_addr) {
if (ttl > 0) {
@@ -65,10 +65,10 @@ static void sdp_write_header(char *buff, int size, struct sdp_session_level *s)
s->id, s->version, s->src_addr,
s->start_time, s->end_time,
s->name[0] ? s->name : "No Name");
- dest_write(buff, size, s->dst_addr, s->ttl);
+ sdp_write_address(buff, size, s->dst_addr, s->ttl);
}
-static int get_address(char *dest_addr, int size, int *ttl, const char *url)
+static int sdp_get_address(char *dest_addr, int size, int *ttl, const char *url)
{
int port;
const char *p;
@@ -157,7 +157,7 @@ static char *extradata2config(AVCodecContext *c)
return config;
}
-static char *sdp_media_attributes(char *buff, int size, AVCodecContext *c, int payload_type)
+static char *sdp_write_media_attributes(char *buff, int size, AVCodecContext *c, int payload_type)
{
char *config = NULL;
@@ -246,12 +246,12 @@ static void sdp_write_media(char *buff, int size, AVCodecContext *c, const char
}
av_strlcatf(buff, size, "m=%s %d RTP/AVP %d\r\n", type, port, payload_type);
- dest_write(buff, size, dest_addr, ttl);
+ sdp_write_address(buff, size, dest_addr, ttl);
if (c->bit_rate) {
av_strlcatf(buff, size, "b=AS:%d\r\n", c->bit_rate / 1000);
}
- sdp_media_attributes(buff, size, c, payload_type);
+ sdp_write_media_attributes(buff, size, c, payload_type);
}
int avf_sdp_create(AVFormatContext *ac[], int n_files, char *buff, int size)
@@ -269,7 +269,7 @@ int avf_sdp_create(AVFormatContext *ac[], int n_files, char *buff, int size)
port = 0;
ttl = 0;
if (n_files == 1) {
- port = get_address(dst, sizeof(dst), &ttl, ac[0]->filename);
+ port = sdp_get_address(dst, sizeof(dst), &ttl, ac[0]->filename);
if (port > 0) {
s.dst_addr = dst;
s.ttl = ttl;
@@ -280,7 +280,7 @@ int avf_sdp_create(AVFormatContext *ac[], int n_files, char *buff, int size)
dst[0] = 0;
for (i = 0; i < n_files; i++) {
if (n_files != 1) {
- port = get_address(dst, sizeof(dst), &ttl, ac[i]->filename);
+ port = sdp_get_address(dst, sizeof(dst), &ttl, ac[i]->filename);
}
for (j = 0; j < ac[i]->nb_streams; j++) {
sdp_write_media(buff, size,
diff --git a/libavformat/segafilm.c b/libavformat/segafilm.c
index 27ff06b..1ec8c28 100644
--- a/libavformat/segafilm.c
+++ b/libavformat/segafilm.c
@@ -40,7 +40,7 @@ typedef struct {
unsigned int sample_size;
int64_t pts;
int keyframe;
-} film_sample_t;
+} film_sample;
typedef struct FilmDemuxContext {
int video_stream_index;
@@ -53,7 +53,7 @@ typedef struct FilmDemuxContext {
enum CodecID video_type;
unsigned int sample_count;
- film_sample_t *sample_table;
+ film_sample *sample_table;
unsigned int current_sample;
unsigned int base_clock;
@@ -163,9 +163,9 @@ static int film_read_header(AVFormatContext *s,
return AVERROR_INVALIDDATA;
film->base_clock = AV_RB32(&scratch[8]);
film->sample_count = AV_RB32(&scratch[12]);
- if(film->sample_count >= UINT_MAX / sizeof(film_sample_t))
+ if(film->sample_count >= UINT_MAX / sizeof(film_sample))
return -1;
- film->sample_table = av_malloc(film->sample_count * sizeof(film_sample_t));
+ film->sample_table = av_malloc(film->sample_count * sizeof(film_sample));
for(i=0; i<s->nb_streams; i++)
av_set_pts_info(s->streams[i], 33, 1, film->base_clock);
@@ -205,7 +205,7 @@ static int film_read_packet(AVFormatContext *s,
{
FilmDemuxContext *film = s->priv_data;
ByteIOContext *pb = s->pb;
- film_sample_t *sample;
+ film_sample *sample;
int ret = 0;
int i;
int left, right;
diff --git a/libavformat/sierravmd.c b/libavformat/sierravmd.c
index 0a5f2fa..1e15a22 100644
--- a/libavformat/sierravmd.c
+++ b/libavformat/sierravmd.c
@@ -39,7 +39,7 @@ typedef struct {
int64_t pts;
int keyframe;
unsigned char frame_record[BYTES_PER_FRAME_RECORD];
-} vmd_frame_t;
+} vmd_frame;
typedef struct VmdDemuxContext {
int video_stream_index;
@@ -47,8 +47,9 @@ typedef struct VmdDemuxContext {
unsigned int frame_count;
unsigned int frames_per_block;
- vmd_frame_t *frame_table;
+ vmd_frame *frame_table;
unsigned int current_frame;
+ int is_indeo3;
int sample_rate;
int64_t audio_sample_counter;
@@ -91,6 +92,10 @@ static int vmd_read_header(AVFormatContext *s,
if (get_buffer(pb, vmd->vmd_header, VMD_HEADER_SIZE) != VMD_HEADER_SIZE)
return AVERROR(EIO);
+ if(vmd->vmd_header[16] == 'i' && vmd->vmd_header[17] == 'v' && vmd->vmd_header[18] == '3')
+ vmd->is_indeo3 = 1;
+ else
+ vmd->is_indeo3 = 0;
/* start up the decoders */
vst = av_new_stream(s, 0);
if (!vst)
@@ -98,10 +103,14 @@ static int vmd_read_header(AVFormatContext *s,
av_set_pts_info(vst, 33, 1, 10);
vmd->video_stream_index = vst->index;
vst->codec->codec_type = CODEC_TYPE_VIDEO;
- vst->codec->codec_id = CODEC_ID_VMDVIDEO;
+ vst->codec->codec_id = vmd->is_indeo3 ? CODEC_ID_INDEO3 : CODEC_ID_VMDVIDEO;
vst->codec->codec_tag = 0; /* no fourcc */
vst->codec->width = AV_RL16(&vmd->vmd_header[12]);
vst->codec->height = AV_RL16(&vmd->vmd_header[14]);
+ if(vmd->is_indeo3 && vst->codec->width > 320){
+ vst->codec->width >>= 1;
+ vst->codec->height >>= 1;
+ }
vst->codec->extradata_size = VMD_HEADER_SIZE;
vst->codec->extradata = av_mallocz(VMD_HEADER_SIZE + FF_INPUT_BUFFER_PADDING_SIZE);
memcpy(vst->codec->extradata, vmd->vmd_header, VMD_HEADER_SIZE);
@@ -146,12 +155,12 @@ static int vmd_read_header(AVFormatContext *s,
vmd->frame_table = NULL;
sound_buffers = AV_RL16(&vmd->vmd_header[808]);
raw_frame_table_size = vmd->frame_count * 6;
- if(vmd->frame_count * vmd->frames_per_block >= UINT_MAX / sizeof(vmd_frame_t)){
+ if(vmd->frame_count * vmd->frames_per_block >= UINT_MAX / sizeof(vmd_frame)){
av_log(s, AV_LOG_ERROR, "vmd->frame_count * vmd->frames_per_block too large\n");
return -1;
}
raw_frame_table = av_malloc(raw_frame_table_size);
- vmd->frame_table = av_malloc((vmd->frame_count * vmd->frames_per_block + sound_buffers) * sizeof(vmd_frame_t));
+ vmd->frame_table = av_malloc((vmd->frame_count * vmd->frames_per_block + sound_buffers) * sizeof(vmd_frame));
if (!raw_frame_table || !vmd->frame_table) {
av_free(raw_frame_table);
av_free(vmd->frame_table);
@@ -248,7 +257,7 @@ static int vmd_read_packet(AVFormatContext *s,
VmdDemuxContext *vmd = s->priv_data;
ByteIOContext *pb = s->pb;
int ret = 0;
- vmd_frame_t *frame;
+ vmd_frame *frame;
if (vmd->current_frame >= vmd->frame_count)
return AVERROR(EIO);
@@ -261,8 +270,11 @@ static int vmd_read_packet(AVFormatContext *s,
return AVERROR(ENOMEM);
pkt->pos= url_ftell(pb);
memcpy(pkt->data, frame->frame_record, BYTES_PER_FRAME_RECORD);
- ret = get_buffer(pb, pkt->data + BYTES_PER_FRAME_RECORD,
- frame->frame_size);
+ if(vmd->is_indeo3)
+ ret = get_buffer(pb, pkt->data, frame->frame_size);
+ else
+ ret = get_buffer(pb, pkt->data + BYTES_PER_FRAME_RECORD,
+ frame->frame_size);
if (ret != frame->frame_size) {
av_free_packet(pkt);
diff --git a/libavformat/udp.c b/libavformat/udp.c
index a8e8cd8..1101ffc 100644
--- a/libavformat/udp.c
+++ b/libavformat/udp.c
@@ -32,6 +32,7 @@
#ifdef HAVE_SYS_SELECT_H
#include <sys/select.h>
#endif
+#include <sys/time.h>
#ifndef IPV6_ADD_MEMBERSHIP
#define IPV6_ADD_MEMBERSHIP IPV6_JOIN_GROUP
@@ -336,7 +337,7 @@ int udp_get_file_handle(URLContext *h)
static int udp_open(URLContext *h, const char *uri, int flags)
{
char hostname[1024];
- int port, udp_fd = -1, tmp;
+ int port, udp_fd = -1, tmp, bind_ret = -1;
UDPContext *s = NULL;
int is_output;
const char *p;
@@ -404,7 +405,13 @@ static int udp_open(URLContext *h, const char *uri, int flags)
goto fail;
/* the bind is needed to give a port to the socket now */
- if (bind(udp_fd,(struct sockaddr *)&my_addr, len) < 0)
+ /* if multicast, try the multicast address bind first */
+ if (s->is_multicast && !(h->flags & URL_WRONLY)) {
+ bind_ret = bind(udp_fd,(struct sockaddr *)&s->dest_addr, len);
+ }
+ /* bind to the local address if not multicast or if the multicast
+ * bind failed */
+ if (bind_ret < 0 && bind(udp_fd,(struct sockaddr *)&my_addr, len) < 0)
goto fail;
len = sizeof(my_addr);
@@ -437,6 +444,8 @@ static int udp_open(URLContext *h, const char *uri, int flags)
if (setsockopt(udp_fd, SOL_SOCKET, SO_RCVBUF, &tmp, sizeof(tmp)) < 0) {
av_log(NULL, AV_LOG_WARNING, "setsockopt(SO_RECVBUF): %s\n", strerror(errno));
}
+ /* make the socket non-blocking */
+ ff_socket_nonblock(udp_fd, 1);
}
s->udp_fd = udp_fd;
@@ -468,7 +477,7 @@ static int udp_read(URLContext *h, uint8_t *buf, int size)
return AVERROR(EIO);
if (!(ret > 0 && FD_ISSET(s->udp_fd, &rfds)))
continue;
- len = recv(s->udp_fd, buf, size, MSG_DONTWAIT);
+ len = recv(s->udp_fd, buf, size, 0);
if (len < 0) {
if (ff_neterrno() != FF_NETERROR(EAGAIN) &&
ff_neterrno() != FF_NETERROR(EINTR))
diff --git a/libavformat/utils.c b/libavformat/utils.c
index 7a91b0f..9c3acfb 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -21,6 +21,7 @@
#include "avformat.h"
#include "internal.h"
#include "libavcodec/opt.h"
+#include "metadata.h"
#include "libavutil/avstring.h"
#include "riff.h"
#include <sys/time.h>
@@ -294,7 +295,7 @@ int av_get_packet(ByteIOContext *s, AVPacket *pkt, int size)
int av_dup_packet(AVPacket *pkt)
{
- if (pkt->destruct != av_destruct_packet) {
+ if (((pkt->destruct == av_destruct_packet_nofree) || (pkt->destruct == NULL)) && pkt->data) {
uint8_t *data;
/* We duplicate the packet and don't forget to add the padding again. */
if((unsigned)pkt->size > (unsigned)pkt->size + FF_INPUT_BUFFER_PADDING_SIZE)
@@ -484,6 +485,10 @@ int av_open_input_stream(AVFormatContext **ic_ptr,
if (pb && !ic->data_offset)
ic->data_offset = url_ftell(ic->pb);
+#if LIBAVFORMAT_VERSION_MAJOR < 53
+ ff_metadata_demux_compat(ic);
+#endif
+
*ic_ptr = ic;
return 0;
fail:
@@ -2284,6 +2289,7 @@ void av_close_input_stream(AVFormatContext *s)
if (st->parser) {
av_parser_close(st->parser);
}
+ av_metadata_free(&st->metadata);
av_free(st->index_entries);
av_free(st->codec->extradata);
av_free(st->codec);
@@ -2294,6 +2300,7 @@ void av_close_input_stream(AVFormatContext *s)
for(i=s->nb_programs-1; i>=0; i--) {
av_freep(&s->programs[i]->provider_name);
av_freep(&s->programs[i]->name);
+ av_metadata_free(&s->programs[i]->metadata);
av_freep(&s->programs[i]->stream_index);
av_freep(&s->programs[i]);
}
@@ -2302,9 +2309,11 @@ void av_close_input_stream(AVFormatContext *s)
av_freep(&s->priv_data);
while(s->nb_chapters--) {
av_free(s->chapters[s->nb_chapters]->title);
+ av_metadata_free(&s->chapters[s->nb_chapters]->metadata);
av_free(s->chapters[s->nb_chapters]);
}
av_freep(&s->chapters);
+ av_metadata_free(&s->metadata);
av_free(s);
}
@@ -2492,6 +2501,10 @@ int av_write_header(AVFormatContext *s)
return AVERROR(ENOMEM);
}
+#if LIBAVFORMAT_VERSION_MAJOR < 53
+ ff_metadata_mux_compat(s);
+#endif
+
if(s->oformat->write_header){
ret = s->oformat->write_header(s);
if (ret < 0)
@@ -2849,6 +2862,7 @@ void dump_format(AVFormatContext *ic,
dump_stream_format(ic, i, index, is_output);
}
+#if LIBAVFORMAT_VERSION_MAJOR < 53
int parse_image_size(int *width_ptr, int *height_ptr, const char *str)
{
return av_parse_video_frame_size(width_ptr, height_ptr, str);
@@ -2862,6 +2876,7 @@ int parse_frame_rate(int *frame_rate_num, int *frame_rate_den, const char *arg)
*frame_rate_den= frame_rate.den;
return ret;
}
+#endif
int64_t av_gettime(void)
{
diff --git a/libavformat/voc.h b/libavformat/voc.h
index 0e8aa61..7993146 100644
--- a/libavformat/voc.h
+++ b/libavformat/voc.h
@@ -27,7 +27,7 @@
typedef struct voc_dec_context {
int remaining_size;
-} voc_dec_context_t;
+} VocDecContext;
typedef enum voc_type {
VOC_TYPE_EOF = 0x00,
@@ -40,7 +40,7 @@ typedef enum voc_type {
VOC_TYPE_REPETITION_END = 0x07,
VOC_TYPE_EXTENDED = 0x08,
VOC_TYPE_NEW_VOICE_DATA = 0x09,
-} voc_type_t;
+} VocType;
extern const unsigned char ff_voc_magic[21];
extern const AVCodecTag ff_voc_codec_tags[];
diff --git a/libavformat/vocdec.c b/libavformat/vocdec.c
index 7aec2ab..8ad6909 100644
--- a/libavformat/vocdec.c
+++ b/libavformat/vocdec.c
@@ -38,7 +38,7 @@ static int voc_probe(AVProbeData *p)
static int voc_read_header(AVFormatContext *s, AVFormatParameters *ap)
{
- voc_dec_context_t *voc = s->priv_data;
+ VocDecContext *voc = s->priv_data;
ByteIOContext *pb = s->pb;
int header_size;
AVStream *st;
@@ -62,10 +62,10 @@ static int voc_read_header(AVFormatContext *s, AVFormatParameters *ap)
int
voc_get_packet(AVFormatContext *s, AVPacket *pkt, AVStream *st, int max_size)
{
- voc_dec_context_t *voc = s->priv_data;
+ VocDecContext *voc = s->priv_data;
AVCodecContext *dec = st->codec;
ByteIOContext *pb = s->pb;
- voc_type_t type;
+ VocType type;
int size;
int sample_rate = 0;
int channels = 1;
@@ -137,7 +137,7 @@ static int voc_read_packet(AVFormatContext *s, AVPacket *pkt)
AVInputFormat voc_demuxer = {
"voc",
NULL_IF_CONFIG_SMALL("Creative Voice file format"),
- sizeof(voc_dec_context_t),
+ sizeof(VocDecContext),
voc_probe,
voc_read_header,
voc_read_packet,
diff --git a/libavformat/vocenc.c b/libavformat/vocenc.c
index 4badb1d..744b233 100644
--- a/libavformat/vocenc.c
+++ b/libavformat/vocenc.c
@@ -24,7 +24,7 @@
typedef struct voc_enc_context {
int param_written;
-} voc_enc_context_t;
+} VocEncContext;
static int voc_write_header(AVFormatContext *s)
{
@@ -46,7 +46,7 @@ static int voc_write_header(AVFormatContext *s)
static int voc_write_packet(AVFormatContext *s, AVPacket *pkt)
{
- voc_enc_context_t *voc = s->priv_data;
+ VocEncContext *voc = s->priv_data;
AVCodecContext *enc = s->streams[0]->codec;
ByteIOContext *pb = s->pb;
@@ -93,7 +93,7 @@ AVOutputFormat voc_muxer = {
NULL_IF_CONFIG_SMALL("Creative Voice file format"),
"audio/x-voc",
"voc",
- sizeof(voc_enc_context_t),
+ sizeof(VocEncContext),
CODEC_ID_PCM_U8,
CODEC_ID_NONE,
voc_write_header,
diff --git a/libavutil/Makefile b/libavutil/Makefile
index bcbc9b3..be2f3e1 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -2,6 +2,23 @@ include $(SUBDIR)../config.mak
NAME = avutil
+HEADERS = adler32.h \
+ avstring.h \
+ avutil.h \
+ base64.h \
+ common.h \
+ crc.h \
+ fifo.h \
+ intfloat_readwrite.h \
+ log.h \
+ lzo.h \
+ mathematics.h \
+ md5.h \
+ mem.h \
+ random.h \
+ rational.h \
+ sha1.h
+
OBJS = adler32.o \
aes.o \
base64.o \
@@ -24,23 +41,6 @@ OBJS = adler32.o \
tree.o \
utils.o \
-HEADERS = adler32.h \
- avstring.h \
- avutil.h \
- base64.h \
- common.h \
- crc.h \
- fifo.h \
- intfloat_readwrite.h \
- log.h \
- lzo.h \
- mathematics.h \
- md5.h \
- mem.h \
- random.h \
- rational.h \
- sha1.h
-
TESTS = $(addsuffix -test$(EXESUF), adler32 aes crc des lls md5 pca random sha1 softfloat tree)
include $(SUBDIR)../subdir.mak
diff --git a/libavutil/avutil.h b/libavutil/avutil.h
index a21a8a4..7366718 100644
--- a/libavutil/avutil.h
+++ b/libavutil/avutil.h
@@ -121,6 +121,7 @@ enum PixelFormat {
PIX_FMT_YUV440P, ///< Planar YUV 4:4:0 (1 Cr & Cb sample per 1x2 Y samples)
PIX_FMT_YUVJ440P, ///< Planar YUV 4:4:0 full scale (jpeg)
PIX_FMT_YUVA420P, ///< Planar YUV 4:2:0, 20bpp, (1 Cr & Cb sample per 2x2 Y & A samples)
+ PIX_FMT_VDPAU_H264,///< H264 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
};
diff --git a/libavutil/bswap.h b/libavutil/bswap.h
index c14676e..100ed1c 100644
--- a/libavutil/bswap.h
+++ b/libavutil/bswap.h
@@ -30,7 +30,7 @@
#include "config.h"
#include "common.h"
-#if defined(ARCH_ARMV4L)
+#if defined(ARCH_ARM)
# include "arm/bswap.h"
#elif defined(ARCH_BFIN)
# include "bfin/bswap.h"
diff --git a/libavutil/common.h b/libavutil/common.h
index cd43abd..d66120f 100644
--- a/libavutil/common.h
+++ b/libavutil/common.h
@@ -41,8 +41,10 @@
# include <math.h>
#endif /* HAVE_AV_CONFIG_H */
+#define AV_GCC_VERSION_AT_LEAST(x,y) (defined(__GNUC__) && (__GNUC__ > x || __GNUC__ == x && __GNUC_MINOR__ >= y))
+
#ifndef av_always_inline
-#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
+#if AV_GCC_VERSION_AT_LEAST(3,1)
# define av_always_inline __attribute__((always_inline)) inline
#else
# define av_always_inline inline
@@ -50,7 +52,7 @@
#endif
#ifndef av_noinline
-#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
+#if AV_GCC_VERSION_AT_LEAST(3,1)
# define av_noinline __attribute__((noinline))
#else
# define av_noinline
@@ -58,7 +60,7 @@
#endif
#ifndef av_pure
-#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
+#if AV_GCC_VERSION_AT_LEAST(3,1)
# define av_pure __attribute__((pure))
#else
# define av_pure
@@ -66,7 +68,7 @@
#endif
#ifndef av_const
-#if defined(__GNUC__) && (__GNUC__ > 2 || __GNUC__ == 2 && __GNUC_MINOR__ > 5)
+#if AV_GCC_VERSION_AT_LEAST(2,6)
# define av_const __attribute__((const))
#else
# define av_const
@@ -74,7 +76,7 @@
#endif
#ifndef av_cold
-#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ > 2)
+#if (!defined(__ICC) || __ICC > 1100) && AV_GCC_VERSION_AT_LEAST(4,3)
# define av_cold __attribute__((cold))
#else
# define av_cold
@@ -86,7 +88,7 @@
#endif /* HAVE_AV_CONFIG_H */
#ifndef attribute_deprecated
-#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
+#if AV_GCC_VERSION_AT_LEAST(3,1)
# define attribute_deprecated __attribute__((deprecated))
#else
# define attribute_deprecated
@@ -321,7 +323,7 @@ static inline av_pure int ff_get_fourcc(const char *s){
}\
}
-#if defined(ARCH_X86) || defined(ARCH_POWERPC) || defined(ARCH_BFIN)
+#if defined(ARCH_X86) || defined(ARCH_PPC) || defined(ARCH_BFIN)
#define AV_READ_TIME read_time
#if defined(ARCH_X86)
static inline uint64_t read_time(void)
diff --git a/libavutil/internal.h b/libavutil/internal.h
index 5ade1af..251cc33 100644
--- a/libavutil/internal.h
+++ b/libavutil/internal.h
@@ -33,9 +33,10 @@
#include <stdint.h>
#include <stddef.h>
#include <assert.h>
+#include "common.h"
#ifndef attribute_align_arg
-#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__>1)
+#if (!defined(__ICC) || __ICC > 1100) && AV_GCC_VERSION_AT_LEAST(4,2)
# define attribute_align_arg __attribute__((force_align_arg_pointer))
#else
# define attribute_align_arg
@@ -43,7 +44,7 @@
#endif
#ifndef attribute_used
-#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
+#if AV_GCC_VERSION_AT_LEAST(3,1)
# define attribute_used __attribute__((used))
#else
# define attribute_used
@@ -148,17 +149,14 @@ static inline av_const int FASTDIV(int a, int b)
: "=&r"(r), "=&r"(t) : "r"(a), "r"(b), "r"(ff_inverse));
return r;
}
-#elif defined(ARCH_ARMV4L)
-# define FASTDIV(a,b) \
- ({\
- int ret,dmy;\
- __asm__ volatile(\
- "umull %1, %0, %2, %3"\
- :"=&r"(ret),"=&r"(dmy)\
- :"r"(a),"r"(ff_inverse[b])\
- );\
- ret;\
- })
+#elif defined(ARCH_ARM)
+static inline av_const int FASTDIV(int a, int b)
+{
+ int r, t;
+ __asm__ volatile ("umull %1, %0, %2, %3"
+ : "=&r"(r), "=&r"(t) : "r"(a), "r"(ff_inverse[b]));
+ return r;
+}
#elif defined(CONFIG_FASTDIV)
# define FASTDIV(a,b) ((uint32_t)((((uint64_t)a)*ff_inverse[b])>>32))
#else
diff --git a/libavutil/mem.c b/libavutil/mem.c
index 960074c..328bef7 100644
--- a/libavutil/mem.c
+++ b/libavutil/mem.c
@@ -31,6 +31,7 @@
#undef free
#undef realloc
+#include <stdlib.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
@@ -41,7 +42,7 @@
void *av_malloc(unsigned int size)
{
- void *ptr;
+ void *ptr = NULL;
#ifdef CONFIG_MEMALIGN_HACK
long diff;
#endif
@@ -57,6 +58,8 @@ void *av_malloc(unsigned int size)
diff= ((-(long)ptr - 1)&15) + 1;
ptr = (char*)ptr + diff;
((char*)ptr)[-1]= diff;
+#elif defined (HAVE_POSIX_MEMALIGN)
+ posix_memalign(&ptr,16,size);
#elif defined (HAVE_MEMALIGN)
ptr = memalign(16,size);
/* Why 64?
diff --git a/libavutil/mem.h b/libavutil/mem.h
index a02c7e1..3ea9858 100644
--- a/libavutil/mem.h
+++ b/libavutil/mem.h
@@ -26,6 +26,8 @@
#ifndef AVUTIL_MEM_H
#define AVUTIL_MEM_H
+#include "common.h"
+
#if defined(__ICC) || defined(__SUNPRO_C)
#define DECLARE_ALIGNED(n,t,v) t v __attribute__ ((aligned (n)))
#define DECLARE_ASM_CONST(n,t,v) const t __attribute__ ((aligned (n))) v
@@ -42,13 +44,13 @@
#define DECLARE_ASM_CONST(n,t,v) static const t v
#endif
-#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
+#if AV_GCC_VERSION_AT_LEAST(3,1)
#define av_malloc_attrib __attribute__((__malloc__))
#else
#define av_malloc_attrib
#endif
-#if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ > 2)
+#if (!defined(__ICC) || __ICC > 1100) && AV_GCC_VERSION_AT_LEAST(4,3)
#define av_alloc_size(n) __attribute__((alloc_size(n)))
#else
#define av_alloc_size(n)
diff --git a/libavutil/pca.c b/libavutil/pca.c
index d21814c..76966f0 100644
--- a/libavutil/pca.c
+++ b/libavutil/pca.c
@@ -168,7 +168,7 @@ int ff_pca(PCA *pca, double *eigenvector, double *eigenvalue){
#include <stdio.h>
#include <stdlib.h>
-int main(){
+int main(void){
PCA *pca;
int i, j, k;
#define LEN 8
diff --git a/libavutil/tree.c b/libavutil/tree.c
index 64653aa..008beb6 100644
--- a/libavutil/tree.c
+++ b/libavutil/tree.c
@@ -128,9 +128,11 @@ void *av_tree_insert(AVTreeNode **tp, void *key, int (*cmp)(void *key, const voi
}
void av_tree_destroy(AVTreeNode *t){
- av_tree_destroy(t->child[0]);
- av_tree_destroy(t->child[1]);
- av_free(t);
+ if(t){
+ av_tree_destroy(t->child[0]);
+ av_tree_destroy(t->child[1]);
+ av_free(t);
+ }
}
#if 0
diff --git a/libpostproc/postprocess.c b/libpostproc/postprocess.c
index 56de4f1..b2f5bad 100644
--- a/libpostproc/postprocess.c
+++ b/libpostproc/postprocess.c
@@ -79,9 +79,6 @@ try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#ifdef HAVE_MALLOC_H
-#include <malloc.h>
-#endif
//#undef HAVE_MMX2
//#define HAVE_3DNOW
//#undef HAVE_MMX
@@ -557,7 +554,7 @@ static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride,
//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
//Plain C versions
-#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
+#if !(defined (HAVE_MMX) || defined (HAVE_ALTIVEC)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_C
#endif
@@ -634,7 +631,7 @@ static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride,
// minor note: the HAVE_xyz is messed up after that line so do not use it.
static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
- const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
+ const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
{
PPContext *c= (PPContext *)vc;
PPMode *ppMode= (PPMode *)vm;
@@ -730,7 +727,7 @@ const char pp_help[] =
"\n"
;
-pp_mode_t *pp_get_mode_by_name_and_quality(const char *name, int quality)
+pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
{
char temp[GET_MODE_BUFFER_SIZE];
char *p= temp;
@@ -905,7 +902,7 @@ pp_mode_t *pp_get_mode_by_name_and_quality(const char *name, int quality)
return ppMode;
}
-void pp_free_mode(pp_mode_t *mode){
+void pp_free_mode(pp_mode *mode){
av_free(mode);
}
@@ -947,7 +944,7 @@ static const char * context_to_name(void * ptr) {
static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
-pp_context_t *pp_get_context(int width, int height, int cpuCaps){
+pp_context *pp_get_context(int width, int height, int cpuCaps){
PPContext *c= av_malloc(sizeof(PPContext));
int stride= (width+15)&(~15); //assumed / will realloc if needed
int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
@@ -995,7 +992,7 @@ void pp_postprocess(const uint8_t * src[3], const int srcStride[3],
uint8_t * dst[3], const int dstStride[3],
int width, int height,
const QP_STORE_T *QP_store, int QPStride,
- pp_mode_t *vm, void *vc, int pict_type)
+ pp_mode *vm, void *vc, int pict_type)
{
int mbWidth = (width+15)>>4;
int mbHeight= (height+15)>>4;
diff --git a/libpostproc/postprocess.h b/libpostproc/postprocess.h
index 2a1d6d5..7e30da7 100644
--- a/libpostproc/postprocess.h
+++ b/libpostproc/postprocess.h
@@ -54,10 +54,12 @@ unsigned postproc_version(void);
#include <inttypes.h>
-typedef void pp_context_t;
-typedef void pp_mode_t;
+typedef void pp_context;
+typedef void pp_mode;
#if LIBPOSTPROC_VERSION_INT < (52<<16)
+typedef pp_context pp_context_t;
+typedef pp_mode pp_mode_t;
extern const char *const pp_help; ///< a simple help text
#else
extern const char pp_help[]; ///< a simple help text
@@ -67,19 +69,19 @@ void pp_postprocess(const uint8_t * src[3], const int srcStride[3],
uint8_t * dst[3], const int dstStride[3],
int horizontalSize, int verticalSize,
const QP_STORE_T *QP_store, int QP_stride,
- pp_mode_t *mode, pp_context_t *ppContext, int pict_type);
+ pp_mode *mode, pp_context *ppContext, int pict_type);
/**
- * returns a pp_mode_t or NULL if an error occurred
+ * returns a pp_mode or NULL if an error occurred
* name is the string after "-pp" on the command line
* quality is a number from 0 to PP_QUALITY_MAX
*/
-pp_mode_t *pp_get_mode_by_name_and_quality(const char *name, int quality);
-void pp_free_mode(pp_mode_t *mode);
+pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality);
+void pp_free_mode(pp_mode *mode);
-pp_context_t *pp_get_context(int width, int height, int flags);
-void pp_free_context(pp_context_t *ppContext);
+pp_context *pp_get_context(int width, int height, int flags);
+void pp_free_context(pp_context *ppContext);
#define PP_CPU_CAPS_MMX 0x80000000
#define PP_CPU_CAPS_MMX2 0x20000000
diff --git a/libswscale/Makefile b/libswscale/Makefile
index 84675e0..a959661 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -3,6 +3,8 @@ include $(SUBDIR)../config.mak
NAME = swscale
FFLIBS = avutil
+HEADERS = swscale.h
+
OBJS = rgb2rgb.o swscale.o swscale_avoption.o
OBJS-$(ARCH_BFIN) += internal_bfin.o swscale_bfin.o yuv2rgb_bfin.o
@@ -11,8 +13,6 @@ OBJS-$(CONFIG_MLIB) += yuv2rgb_mlib.o
OBJS-$(HAVE_ALTIVEC) += yuv2rgb_altivec.o
OBJS-$(HAVE_VIS) += yuv2rgb_vis.o
-HEADERS = swscale.h
-
TESTS = cs_test swscale-example
CLEANFILES = cs_test swscale-example
diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
index b551412..ac452a0 100644
--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -47,12 +47,10 @@ void (*rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size);
-//void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size);
-//void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);
@@ -124,21 +122,6 @@ DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL;
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
-
-#if 0
-static volatile uint64_t __attribute__((aligned(8))) b5Dither;
-static volatile uint64_t __attribute__((aligned(8))) g5Dither;
-static volatile uint64_t __attribute__((aligned(8))) g6Dither;
-static volatile uint64_t __attribute__((aligned(8))) r5Dither;
-
-static uint64_t __attribute__((aligned(8))) dither4[2]={
- 0x0103010301030103LL,
- 0x0200020002000200LL,};
-
-static uint64_t __attribute__((aligned(8))) dither8[2]={
- 0x0602060206020602LL,
- 0x0004000400040004LL,};
-#endif
#endif /* defined(ARCH_X86) */
#define RGB2YUV_SHIFT 8
diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
index 5cc6a2d..df912c8 100644
--- a/libswscale/rgb2rgb.h
+++ b/libswscale/rgb2rgb.h
@@ -48,25 +48,25 @@ extern void (*rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb24to32 (const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb32to24 (const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb16tobgr32(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb16to24 (const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb16tobgr16(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb16tobgr15(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb15tobgr32(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb15to24 (const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb15tobgr16(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb15tobgr15(const uint8_t *src, uint8_t *dst, long src_size);
-extern void bgr8torgb8 (const uint8_t *src, uint8_t *dst, long src_size);
-
-
-extern void palette8topacked32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
-extern void palette8topacked24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
-extern void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
-extern void palette8tobgr16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
-extern void palette8torgb15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
-extern void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
+void rgb24to32 (const uint8_t *src, uint8_t *dst, long src_size);
+void rgb32to24 (const uint8_t *src, uint8_t *dst, long src_size);
+void rgb16tobgr32(const uint8_t *src, uint8_t *dst, long src_size);
+void rgb16to24 (const uint8_t *src, uint8_t *dst, long src_size);
+void rgb16tobgr16(const uint8_t *src, uint8_t *dst, long src_size);
+void rgb16tobgr15(const uint8_t *src, uint8_t *dst, long src_size);
+void rgb15tobgr32(const uint8_t *src, uint8_t *dst, long src_size);
+void rgb15to24 (const uint8_t *src, uint8_t *dst, long src_size);
+void rgb15tobgr16(const uint8_t *src, uint8_t *dst, long src_size);
+void rgb15tobgr15(const uint8_t *src, uint8_t *dst, long src_size);
+void bgr8torgb8 (const uint8_t *src, uint8_t *dst, long src_size);
+
+
+void palette8topacked32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
+void palette8topacked24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
+void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
+void palette8tobgr16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
+void palette8torgb15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
+void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
index ab82035..ce76c5e 100644
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
@@ -2016,14 +2016,14 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
{
#ifdef HAVE_MMX
__asm__ volatile(
- "xorl %%eax, %%eax \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t"
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
ASMALIGN(4)
"1: \n\t"
- PREFETCH" 64(%0, %%eax, 4) \n\t"
- "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
- "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
+ PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
+ "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
+ "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
"movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
"pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
@@ -2033,10 +2033,10 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
- MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
+ MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
- "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
- "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
+ "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
+ "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
"movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
@@ -2046,7 +2046,7 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
- MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
+ MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
@@ -2057,28 +2057,28 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
- MOVNTQ" %%mm0, (%3, %%eax) \n\t"
- MOVNTQ" %%mm2, (%2, %%eax) \n\t"
+ MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
+ MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
- "addl $8, %%eax \n\t"
- "cmpl %4, %%eax \n\t"
+ "add $8, %%"REG_a" \n\t"
+ "cmp %4, %%"REG_a" \n\t"
" jb 1b \n\t"
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
- : "memory", "%eax"
+ : "memory", "%"REG_a
);
ydst += lumStride;
src += srcStride;
__asm__ volatile(
- "xorl %%eax, %%eax \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t"
ASMALIGN(4)
"1: \n\t"
- PREFETCH" 64(%0, %%eax, 4) \n\t"
- "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
- "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
- "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
- "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
+ PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
+ "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
+ "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
+ "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
+ "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
"psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
"psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
@@ -2086,15 +2086,15 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
- MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
- MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
+ MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
+ MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
- "addl $8, %%eax \n\t"
- "cmpl %4, %%eax \n\t"
+ "add $8, %%"REG_a" \n\t"
+ "cmp %4, %%"REG_a" \n\t"
" jb 1b \n\t"
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
- : "memory", "%eax"
+ : "memory", "%"REG_a
);
#else
long i;
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 577d093..cfa3580 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -452,6 +452,8 @@ const char *sws_format_name(enum PixelFormat format)
return "nv21";
case PIX_FMT_YUV440P:
return "yuv440p";
+ case PIX_FMT_VDPAU_H264:
+ return "vdpau_h264";
default:
return "Unknown format";
}
@@ -949,11 +951,11 @@ static inline void yuv2rgbXinC_full(SwsContext *c, int16_t *lumFilter, int16_t *
#define COMPILE_C
#endif
-#ifdef ARCH_POWERPC
+#ifdef ARCH_PPC
#if (defined (HAVE_ALTIVEC) || defined (RUNTIME_CPUDETECT)) && defined (CONFIG_GPL)
#define COMPILE_ALTIVEC
#endif //HAVE_ALTIVEC
-#endif //ARCH_POWERPC
+#endif //ARCH_PPC
#if defined(ARCH_X86)
@@ -1628,7 +1630,7 @@ static SwsFunc getSwsFunc(int flags){
return swScale_C;
#else
-#ifdef ARCH_POWERPC
+#ifdef ARCH_PPC
if (flags & SWS_CPU_CAPS_ALTIVEC)
return swScale_altivec;
else
diff --git a/libswscale/swscale_bfin.c b/libswscale/swscale_bfin.c
index 3e63bbd..ed7d957 100644
--- a/libswscale/swscale_bfin.c
+++ b/libswscale/swscale_bfin.c
@@ -26,9 +26,6 @@
#include <inttypes.h>
#include <assert.h>
#include "config.h"
-#ifdef HAVE_MALLOC_H
-#include <malloc.h>
-#endif
#include <unistd.h>
#include "rgb2rgb.h"
#include "swscale.h"
@@ -40,13 +37,13 @@
#define L1CODE
#endif
-extern int ff_bfin_uyvytoyv12 (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
- long width, long height,
- long lumStride, long chromStride, long srcStride) L1CODE;
+int ff_bfin_uyvytoyv12 (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ long width, long height,
+ long lumStride, long chromStride, long srcStride) L1CODE;
-extern int ff_bfin_yuyvtoyv12 (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
- long width, long height,
- long lumStride, long chromStride, long srcStride) L1CODE;
+int ff_bfin_yuyvtoyv12 (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ long width, long height,
+ long lumStride, long chromStride, long srcStride) L1CODE;
static int uyvytoyv12_unscaled (SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[])
diff --git a/libswscale/swscale_template.c b/libswscale/swscale_template.c
index 5754c10..4c8bc6e 100644
--- a/libswscale/swscale_template.c
+++ b/libswscale/swscale_template.c
@@ -1826,11 +1826,12 @@ static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
- int i;
- assert(src1==src2);
#ifdef HAVE_MMX
+ assert(src1==src2);
RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
#else
+ int i;
+ assert(src1==src2);
for (i=0; i<width; i++)
{
int r= src1[3*i + 0];
diff --git a/libswscale/yuv2rgb_altivec.c b/libswscale/yuv2rgb_altivec.c
index 0223fdd..baffbc8 100644
--- a/libswscale/yuv2rgb_altivec.c
+++ b/libswscale/yuv2rgb_altivec.c
@@ -91,9 +91,6 @@ adjustment.
#include <inttypes.h>
#include <assert.h>
#include "config.h"
-#ifdef HAVE_MALLOC_H
-#include <malloc.h>
-#endif
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
diff --git a/libswscale/yuv2rgb_bfin.c b/libswscale/yuv2rgb_bfin.c
index 1500a96..58cc5b6 100644
--- a/libswscale/yuv2rgb_bfin.c
+++ b/libswscale/yuv2rgb_bfin.c
@@ -27,9 +27,6 @@
#include <inttypes.h>
#include <assert.h>
#include "config.h"
-#ifdef HAVE_MALLOC_H
-#include <malloc.h>
-#endif
#include <unistd.h>
#include "rgb2rgb.h"
#include "swscale.h"
@@ -41,17 +38,17 @@
#define L1CODE
#endif
-extern void ff_bfin_yuv2rgb555_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
- int w, uint32_t *coeffs) L1CODE;
+void ff_bfin_yuv2rgb555_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+ int w, uint32_t *coeffs) L1CODE;
-extern void ff_bfin_yuv2rgb565_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
- int w, uint32_t *coeffs) L1CODE;
+void ff_bfin_yuv2rgb565_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+ int w, uint32_t *coeffs) L1CODE;
-extern void ff_bfin_yuv2rgb24_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
- int w, uint32_t *coeffs) L1CODE;
+void ff_bfin_yuv2rgb24_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+ int w, uint32_t *coeffs) L1CODE;
-typedef void (* ltransform_t)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
- int w, uint32_t *coeffs);
+typedef void (* ltransform)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+ int w, uint32_t *coeffs);
static void bfin_prepare_coefficients (SwsContext *c, int rgb, int masks)
@@ -95,7 +92,7 @@ static int core_yuv420_rgb (SwsContext *c,
uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides,
- ltransform_t lcscf, int rgb, int masks)
+ ltransform lcscf, int rgb, int masks)
{
uint8_t *py,*pu,*pv,*op;
int w = instrides[0];
diff --git a/subdir.mak b/subdir.mak
index 7c38b77..df5bb8a 100644
--- a/subdir.mak
+++ b/subdir.mak
@@ -29,7 +29,7 @@ $(SUBDIR)$(SLIBNAME): $(SUBDIR)$(SLIBNAME_WITH_MAJOR)
$(SUBDIR)$(SLIBNAME_WITH_MAJOR): $(OBJS)
$(SLIB_CREATE_DEF_CMD)
- $(CC) $(SHFLAGS) $(FFLDFLAGS) -o $$@ $$^ $(FFEXTRALIBS) $(EXTRAOBJS)
+ $(CC) $(SHFLAGS) $(FFLDFLAGS) -o $$@ $$(filter-out $(DEP_LIBS),$$^) $(FFEXTRALIBS) $(EXTRAOBJS)
$(SLIB_EXTRA_CMD)
ifdef SUBDIR
diff --git a/tests/regression.sh b/tests/regression.sh
index 24b42da..529a703 100755
--- a/tests/regression.sh
+++ b/tests/regression.sh
@@ -7,7 +7,11 @@
set -e
+target_exec=$5
+target_path=$6
+
datadir="./tests/data"
+target_datadir="${target_path}/${datadir}"
test="${1#regtest-}"
this="$test.$2"
@@ -17,21 +21,24 @@ outfile="$datadir/$4-"
eval do_$test=y
# various files
-ffmpeg="./ffmpeg_g"
+ffmpeg="$target_exec ${target_path}/ffmpeg_g"
tiny_psnr="tests/tiny_psnr"
benchfile="$datadir/$this.bench"
bench="$datadir/$this.bench.tmp"
bench2="$datadir/$this.bench2.tmp"
-raw_src="$3/%02d.pgm"
+raw_src="${target_path}/$3/%02d.pgm"
raw_dst="$datadir/$this.out.yuv"
raw_ref="$datadir/$2.ref.yuv"
-pcm_src="tests/asynth1.sw"
+pcm_src="${target_path}/tests/asynth1.sw"
pcm_dst="$datadir/$this.out.wav"
pcm_ref="$datadir/$2.ref.wav"
crcfile="$datadir/$this.crc"
+target_crcfile="$target_datadir/$this.crc"
if [ X"`echo | md5sum 2> /dev/null`" != X ]; then
do_md5sum() { md5sum -b $1; }
+elif [ X"`echo | md5 2> /dev/null`" != X ]; then
+ do_md5sum() { md5 -r $1 | sed 's# \**\./# *./#'; }
elif [ -x /sbin/md5 ]; then
do_md5sum() { /sbin/md5 -r $1 | sed 's# \**\./# *./#'; }
else
@@ -47,6 +54,7 @@ do_ffmpeg()
{
f="$1"
shift
+ set -- $* ${target_path}/$f
echo $ffmpeg $FFMPEG_OPTS $*
$ffmpeg $FFMPEG_OPTS -benchmark $* > $bench 2> /tmp/ffmpeg$$
egrep -v "^(Stream|Press|Input|Output|frame| Stream| Duration|video:)" /tmp/ffmpeg$$ || true
@@ -67,6 +75,7 @@ do_ffmpeg_nomd5()
{
f="$1"
shift
+ set -- $* ${target_path}/$f
echo $ffmpeg $FFMPEG_OPTS $*
$ffmpeg $FFMPEG_OPTS -benchmark $* > $bench 2> /tmp/ffmpeg$$
egrep -v "^(Stream|Press|Input|Output|frame| Stream| Duration|video:)" /tmp/ffmpeg$$ || true
@@ -86,8 +95,8 @@ do_ffmpeg_crc()
{
f="$1"
shift
- echo $ffmpeg $FFMPEG_OPTS $* -f crc "$crcfile"
- $ffmpeg $FFMPEG_OPTS $* -f crc "$crcfile" > /tmp/ffmpeg$$ 2>&1
+ echo $ffmpeg $FFMPEG_OPTS $* -f crc "$target_crcfile"
+ $ffmpeg $FFMPEG_OPTS $* -f crc "$target_crcfile" > /tmp/ffmpeg$$ 2>&1
egrep -v "^(Stream|Press|Input|Output|frame| Stream| Duration|video:|ffmpeg version| configuration| built)" /tmp/ffmpeg$$ || true
rm -f /tmp/ffmpeg$$
echo "$f `cat $crcfile`" >> $logfile
@@ -108,55 +117,55 @@ do_ffmpeg_nocheck()
do_video_decoding()
{
- do_ffmpeg $raw_dst $1 -i $file -f rawvideo $2 $raw_dst
+ do_ffmpeg $raw_dst $1 -i $target_path/$file -f rawvideo $2
rm -f $raw_dst
}
do_video_encoding()
{
file=${outfile}$1
- do_ffmpeg $file $2 -f image2 -vcodec pgmyuv -i $raw_src $3 $file
+ do_ffmpeg $file $2 -f image2 -vcodec pgmyuv -i $raw_src $3
}
do_audio_encoding()
{
file=${outfile}$1
- do_ffmpeg $file -ab 128k -ac 2 -f s16le -i $pcm_src $3 $file
+ do_ffmpeg $file -ab 128k -ac 2 -f s16le -i $pcm_src $3
}
do_audio_decoding()
{
- do_ffmpeg $pcm_dst -i $file -sample_fmt s16 -f wav $pcm_dst
+ do_ffmpeg $pcm_dst -i $target_path/$file -sample_fmt s16 -f wav
}
do_libav()
{
file=${outfile}libav.$1
- do_ffmpeg $file -t 1 -qscale 10 -f image2 -vcodec pgmyuv -i $raw_src -f s16le -i $pcm_src $2 $file
- do_ffmpeg_crc $file -i $file $3
+ do_ffmpeg $file -t 1 -qscale 10 -f image2 -vcodec pgmyuv -i $raw_src -f s16le -i $pcm_src $2
+ do_ffmpeg_crc $file -i $target_path/$file $3
}
do_streamed_images()
{
file=${outfile}${1}pipe.$1
- do_ffmpeg $file -t 1 -qscale 10 -f image2 -vcodec pgmyuv -i $raw_src -f image2pipe $file
- do_ffmpeg_crc $file -f image2pipe -i $file
+ do_ffmpeg $file -t 1 -qscale 10 -f image2 -vcodec pgmyuv -i $raw_src -f image2pipe
+ do_ffmpeg_crc $file -f image2pipe -i $target_path/$file
}
do_image_formats()
{
file=${outfile}libav%02d.$1
- $ffmpeg -t 0.5 -y -qscale 10 -f image2 -vcodec pgmyuv -i $raw_src $2 $3 -flags +bitexact -sws_flags +accurate_rnd+bitexact $file
+ $ffmpeg -t 0.5 -y -qscale 10 -f image2 -vcodec pgmyuv -i $raw_src $2 $3 -flags +bitexact -sws_flags +accurate_rnd+bitexact $target_path/$file
do_md5sum ${outfile}libav02.$1 >> $logfile
- do_ffmpeg_crc $file $3 -i $file
+ do_ffmpeg_crc $file $3 -i $target_path/$file
wc -c ${outfile}libav02.$1 >> $logfile
}
do_audio_only()
{
file=${outfile}libav.$1
- do_ffmpeg $file -t 1 -qscale 10 -f s16le -i $pcm_src $file
- do_ffmpeg_crc $file -i $file
+ do_ffmpeg $file -t 1 -qscale 10 -f s16le -i $pcm_src
+ do_ffmpeg_crc $file -i $target_path/$file
}
rm -f "$logfile"
@@ -164,8 +173,8 @@ rm -f "$benchfile"
# generate reference for quality check
if [ -n "$do_ref" ]; then
-do_ffmpeg_nocheck $raw_ref -f image2 -vcodec pgmyuv -i $raw_src -an -f rawvideo $raw_ref
-do_ffmpeg_nocheck $pcm_ref -ab 128k -ac 2 -ar 44100 -f s16le -i $pcm_src -f wav $pcm_ref
+do_ffmpeg_nocheck $raw_ref -f image2 -vcodec pgmyuv -i $raw_src -an -f rawvideo $target_path/$raw_ref
+do_ffmpeg_nocheck $pcm_ref -ab 128k -ac 2 -ar 44100 -f s16le -i $pcm_src -f wav $target_path/$pcm_ref
fi
if [ -n "$do_mpeg" ] ; then
@@ -207,7 +216,7 @@ do_video_decoding
# mpeg2 encoding interlaced
file=${outfile}mpeg2reuse.mpg
-do_ffmpeg $file -sameq -me_threshold 256 -mb_threshold 1024 -i ${outfile}mpeg2thread.mpg -vcodec mpeg2video -f mpeg1video -bf 2 -flags +ildct+ilme -threads 4 $file
+do_ffmpeg $file -sameq -me_threshold 256 -mb_threshold 1024 -i ${target_path}/${outfile}mpeg2thread.mpg -vcodec mpeg2video -f mpeg1video -bf 2 -flags +ildct+ilme -threads 4
do_video_decoding
fi
@@ -424,11 +433,11 @@ fi
if [ -n "$do_wma" ] ; then
# wmav1
do_audio_encoding wmav1.asf "-ar 44100" "-acodec wmav1"
-do_ffmpeg_nomd5 $pcm_dst -i $file -f wav $pcm_dst
+do_ffmpeg_nomd5 $pcm_dst -i $target_path/$file -f wav
$tiny_psnr $pcm_dst $pcm_ref 2 8192 >> $logfile
# wmav2
do_audio_encoding wmav2.asf "-ar 44100" "-acodec wmav2"
-do_ffmpeg_nomd5 $pcm_dst -i $file -f wav $pcm_dst
+do_ffmpeg_nomd5 $pcm_dst -i $target_path/$file -f wav
$tiny_psnr $pcm_dst $pcm_ref 2 8192 >> $logfile
fi
@@ -481,9 +490,9 @@ fi
if [ -n "$do_rm" ] ; then
file=${outfile}libav.rm
-do_ffmpeg $file -t 1 -qscale 10 -f image2 -vcodec pgmyuv -i $raw_src -f s16le -i $pcm_src $file
+do_ffmpeg $file -t 1 -qscale 10 -f image2 -vcodec pgmyuv -i $raw_src -f s16le -i $pcm_src
# broken
-#do_ffmpeg_crc $file -i $file
+#do_ffmpeg_crc $file -i $target_path/$file
fi
if [ -n "$do_mpg" ] ; then
@@ -530,8 +539,8 @@ fi
# streamed images
# mjpeg
#file=${outfile}libav.mjpeg
-#do_ffmpeg $file -t 1 -qscale 10 -f image2 -vcodec pgmyuv -i $raw_src $file
-#do_ffmpeg_crc $file -i $file
+#do_ffmpeg $file -t 1 -qscale 10 -f image2 -vcodec pgmyuv -i $raw_src
+#do_ffmpeg_crc $file -i $target_path/$file
if [ -n "$do_pbmpipe" ] ; then
do_streamed_images pbm
@@ -547,14 +556,14 @@ fi
if [ -n "$do_gif" ] ; then
file=${outfile}libav.gif
-do_ffmpeg $file -t 1 -qscale 10 -f image2 -vcodec pgmyuv -i $raw_src -pix_fmt rgb24 $file
-#do_ffmpeg_crc $file -i $file
+do_ffmpeg $file -t 1 -qscale 10 -f image2 -vcodec pgmyuv -i $raw_src -pix_fmt rgb24
+#do_ffmpeg_crc $file -i $target_path/$file
fi
if [ -n "$do_yuv4mpeg" ] ; then
file=${outfile}libav.y4m
-do_ffmpeg $file -t 1 -qscale 10 -f image2 -vcodec pgmyuv -i $raw_src $file
-#do_ffmpeg_crc $file -i $file
+do_ffmpeg $file -t 1 -qscale 10 -f image2 -vcodec pgmyuv -i $raw_src
+#do_ffmpeg_crc $file -i $target_path/$file
fi
# image formats
@@ -630,9 +639,9 @@ conversions="yuv420p yuv422p yuv444p yuyv422 yuv410p yuv411p yuvj420p \
for pix_fmt in $conversions ; do
file=${outfile}libav-${pix_fmt}.yuv
do_ffmpeg_nocheck $file -r 1 -t 1 -f image2 -vcodec pgmyuv -i $raw_src \
- -f rawvideo -s 352x288 -pix_fmt $pix_fmt $raw_dst
- do_ffmpeg $file -f rawvideo -s 352x288 -pix_fmt $pix_fmt -i $raw_dst \
- -f rawvideo -s 352x288 -pix_fmt yuv444p $file
+ -f rawvideo -s 352x288 -pix_fmt $pix_fmt $target_path/$raw_dst
+ do_ffmpeg $file -f rawvideo -s 352x288 -pix_fmt $pix_fmt -i $target_path/$raw_dst \
+ -f rawvideo -s 352x288 -pix_fmt yuv444p
done
fi
diff --git a/tests/seek.regression.ref b/tests/seek.regression.ref
index a38f593..90181f8 100644
--- a/tests/seek.regression.ref
+++ b/tests/seek.regression.ref
@@ -1532,99 +1532,99 @@ ret: 0 st: 0 dts:0.880000 pts:-102481911520608.625000 pos:79182 size:10791 flags
ret:-1 st:-1 ts:-0.645825 flags:1
----------------
tests/data/a-mpeg4-Q.avi
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:11942 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:11942 flags:1
ret: 0 st:-1 ts:-1.000000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:11942 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:11942 flags:1
ret: 0 st:-1 ts:1.894167 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:142978 size:15562 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:142978 size:15562 flags:1
ret: 0 st: 0 ts:0.800000 flags:0
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:64442 size:13382 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:64442 size:13382 flags:1
ret:-1 st: 0 ts:-0.320000 flags:1
ret:-1 st:-1 ts:2.576668 flags:0
ret: 0 st:-1 ts:1.470835 flags:1
-ret: 0 st: 0 dts:1.360000 pts:1.360000 pos:101236 size:15057 flags:1
+ret: 0 st: 0 dts:1.360000 pts:-368934881474191040.000000 pos:101236 size:15057 flags:1
ret: 0 st: 0 ts:0.360000 flags:0
-ret: 0 st: 0 dts:0.400000 pts:0.400000 pos:32850 size:11813 flags:1
+ret: 0 st: 0 dts:0.400000 pts:-368934881474191040.000000 pos:32850 size:11813 flags:1
ret:-1 st: 0 ts:-0.760000 flags:1
ret:-1 st:-1 ts:2.153336 flags:0
ret: 0 st:-1 ts:1.047503 flags:1
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:64442 size:13382 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:64442 size:13382 flags:1
ret: 0 st: 0 ts:-0.040000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:11942 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:11942 flags:1
ret: 0 st: 0 ts:2.840000 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:142978 size:15562 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:142978 size:15562 flags:1
ret: 0 st:-1 ts:1.730004 flags:0
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:142978 size:15562 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:142978 size:15562 flags:1
ret: 0 st:-1 ts:0.624171 flags:1
-ret: 0 st: 0 dts:0.400000 pts:0.400000 pos:32850 size:11813 flags:1
+ret: 0 st: 0 dts:0.400000 pts:-368934881474191040.000000 pos:32850 size:11813 flags:1
ret: 0 st: 0 ts:-0.480000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:11942 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:11942 flags:1
ret: 0 st: 0 ts:2.400000 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:142978 size:15562 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:142978 size:15562 flags:1
ret: 0 st:-1 ts:1.306672 flags:0
-ret: 0 st: 0 dts:1.360000 pts:1.360000 pos:101236 size:15057 flags:1
+ret: 0 st: 0 dts:1.360000 pts:-368934881474191040.000000 pos:101236 size:15057 flags:1
ret: 0 st:-1 ts:0.200839 flags:1
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:11942 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:11942 flags:1
ret: 0 st: 0 ts:-0.920000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:11942 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:11942 flags:1
ret: 0 st: 0 ts:2.000000 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:142978 size:15562 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:142978 size:15562 flags:1
ret: 0 st:-1 ts:0.883340 flags:0
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:64442 size:13382 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:64442 size:13382 flags:1
ret:-1 st:-1 ts:-0.222493 flags:1
ret:-1 st: 0 ts:2.680000 flags:0
ret: 0 st: 0 ts:1.560000 flags:1
-ret: 0 st: 0 dts:1.360000 pts:1.360000 pos:101236 size:15057 flags:1
+ret: 0 st: 0 dts:1.360000 pts:-368934881474191040.000000 pos:101236 size:15057 flags:1
ret: 0 st:-1 ts:0.460008 flags:0
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:64442 size:13382 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:64442 size:13382 flags:1
ret:-1 st:-1 ts:-0.645825 flags:1
----------------
tests/data/a-mpeg4-adap.avi
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:6855 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:6855 flags:1
ret: 0 st:-1 ts:-1.000000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:6855 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:6855 flags:1
ret: 0 st:-1 ts:1.894167 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:175668 size:16884 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:175668 size:16884 flags:1
ret: 0 st: 0 ts:0.800000 flags:0
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:99006 size:16933 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:99006 size:16933 flags:1
ret:-1 st: 0 ts:-0.320000 flags:1
ret:-1 st:-1 ts:2.576668 flags:0
ret: 0 st:-1 ts:1.470835 flags:1
-ret: 0 st: 0 dts:1.360000 pts:1.360000 pos:136592 size:17435 flags:1
+ret: 0 st: 0 dts:1.360000 pts:-368934881474191040.000000 pos:136592 size:17435 flags:1
ret: 0 st: 0 ts:0.360000 flags:0
-ret: 0 st: 0 dts:0.400000 pts:0.400000 pos:59872 size:17261 flags:1
+ret: 0 st: 0 dts:0.400000 pts:-368934881474191040.000000 pos:59872 size:17261 flags:1
ret:-1 st: 0 ts:-0.760000 flags:1
ret:-1 st:-1 ts:2.153336 flags:0
ret: 0 st:-1 ts:1.047503 flags:1
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:99006 size:16933 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:99006 size:16933 flags:1
ret: 0 st: 0 ts:-0.040000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:6855 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:6855 flags:1
ret: 0 st: 0 ts:2.840000 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:175668 size:16884 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:175668 size:16884 flags:1
ret: 0 st:-1 ts:1.730004 flags:0
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:175668 size:16884 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:175668 size:16884 flags:1
ret: 0 st:-1 ts:0.624171 flags:1
-ret: 0 st: 0 dts:0.400000 pts:0.400000 pos:59872 size:17261 flags:1
+ret: 0 st: 0 dts:0.400000 pts:-368934881474191040.000000 pos:59872 size:17261 flags:1
ret: 0 st: 0 ts:-0.480000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:6855 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:6855 flags:1
ret: 0 st: 0 ts:2.400000 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:175668 size:16884 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:175668 size:16884 flags:1
ret: 0 st:-1 ts:1.306672 flags:0
-ret: 0 st: 0 dts:1.360000 pts:1.360000 pos:136592 size:17435 flags:1
+ret: 0 st: 0 dts:1.360000 pts:-368934881474191040.000000 pos:136592 size:17435 flags:1
ret: 0 st:-1 ts:0.200839 flags:1
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:6855 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:6855 flags:1
ret: 0 st: 0 ts:-0.920000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:6855 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:6855 flags:1
ret: 0 st: 0 ts:2.000000 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:175668 size:16884 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:175668 size:16884 flags:1
ret: 0 st:-1 ts:0.883340 flags:0
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:99006 size:16933 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:99006 size:16933 flags:1
ret:-1 st:-1 ts:-0.222493 flags:1
ret:-1 st: 0 ts:2.680000 flags:0
ret: 0 st: 0 ts:1.560000 flags:1
-ret: 0 st: 0 dts:1.360000 pts:1.360000 pos:136592 size:17435 flags:1
+ret: 0 st: 0 dts:1.360000 pts:-368934881474191040.000000 pos:136592 size:17435 flags:1
ret: 0 st:-1 ts:0.460008 flags:0
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:99006 size:16933 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:99006 size:16933 flags:1
ret:-1 st:-1 ts:-0.645825 flags:1
----------------
tests/data/a-mpeg4-adv.avi
@@ -1724,147 +1724,147 @@ ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:35312 size:9987 flags:1
ret:-1 st:-1 ts:-0.645825 flags:1
----------------
tests/data/a-mpeg4-qprd.avi
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:14873 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:14873 flags:1
ret: 0 st:-1 ts:-1.000000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:14873 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:14873 flags:1
ret: 0 st:-1 ts:1.894167 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:212738 size:14347 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:212738 size:14347 flags:1
ret: 0 st: 0 ts:0.800000 flags:0
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:152640 size:14348 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:152640 size:14348 flags:1
ret:-1 st: 0 ts:-0.320000 flags:1
ret:-1 st:-1 ts:2.576668 flags:0
ret: 0 st:-1 ts:1.470835 flags:1
-ret: 0 st: 0 dts:1.360000 pts:1.360000 pos:182580 size:14281 flags:1
+ret: 0 st: 0 dts:1.360000 pts:-368934881474191040.000000 pos:182580 size:14281 flags:1
ret: 0 st: 0 ts:0.360000 flags:0
-ret: 0 st: 0 dts:0.400000 pts:0.400000 pos:94070 size:29366 flags:1
+ret: 0 st: 0 dts:0.400000 pts:-368934881474191040.000000 pos:94070 size:29366 flags:1
ret:-1 st: 0 ts:-0.760000 flags:1
ret:-1 st:-1 ts:2.153336 flags:0
ret: 0 st:-1 ts:1.047503 flags:1
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:152640 size:14348 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:152640 size:14348 flags:1
ret: 0 st: 0 ts:-0.040000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:14873 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:14873 flags:1
ret: 0 st: 0 ts:2.840000 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:212738 size:14347 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:212738 size:14347 flags:1
ret: 0 st:-1 ts:1.730004 flags:0
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:212738 size:14347 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:212738 size:14347 flags:1
ret: 0 st:-1 ts:0.624171 flags:1
-ret: 0 st: 0 dts:0.400000 pts:0.400000 pos:94070 size:29366 flags:1
+ret: 0 st: 0 dts:0.400000 pts:-368934881474191040.000000 pos:94070 size:29366 flags:1
ret: 0 st: 0 ts:-0.480000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:14873 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:14873 flags:1
ret: 0 st: 0 ts:2.400000 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:212738 size:14347 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:212738 size:14347 flags:1
ret: 0 st:-1 ts:1.306672 flags:0
-ret: 0 st: 0 dts:1.360000 pts:1.360000 pos:182580 size:14281 flags:1
+ret: 0 st: 0 dts:1.360000 pts:-368934881474191040.000000 pos:182580 size:14281 flags:1
ret: 0 st:-1 ts:0.200839 flags:1
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:14873 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:14873 flags:1
ret: 0 st: 0 ts:-0.920000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:14873 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:14873 flags:1
ret: 0 st: 0 ts:2.000000 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:212738 size:14347 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:212738 size:14347 flags:1
ret: 0 st:-1 ts:0.883340 flags:0
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:152640 size:14348 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:152640 size:14348 flags:1
ret:-1 st:-1 ts:-0.222493 flags:1
ret:-1 st: 0 ts:2.680000 flags:0
ret: 0 st: 0 ts:1.560000 flags:1
-ret: 0 st: 0 dts:1.360000 pts:1.360000 pos:182580 size:14281 flags:1
+ret: 0 st: 0 dts:1.360000 pts:-368934881474191040.000000 pos:182580 size:14281 flags:1
ret: 0 st:-1 ts:0.460008 flags:0
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:152640 size:14348 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:152640 size:14348 flags:1
ret:-1 st:-1 ts:-0.645825 flags:1
----------------
tests/data/a-mpeg4-rc.avi
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:15766 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:15766 flags:1
ret: 0 st:-1 ts:-1.000000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:15766 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:15766 flags:1
ret: 0 st:-1 ts:1.894167 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:209236 size:13826 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:209236 size:13826 flags:1
ret: 0 st: 0 ts:0.800000 flags:0
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:154792 size:13382 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:154792 size:13382 flags:1
ret:-1 st: 0 ts:-0.320000 flags:1
ret:-1 st:-1 ts:2.576668 flags:0
ret: 0 st:-1 ts:1.470835 flags:1
-ret: 0 st: 0 dts:1.360000 pts:1.360000 pos:182316 size:13326 flags:1
+ret: 0 st: 0 dts:1.360000 pts:-368934881474191040.000000 pos:182316 size:13326 flags:1
ret: 0 st: 0 ts:0.360000 flags:0
-ret: 0 st: 0 dts:0.400000 pts:0.400000 pos:95408 size:32807 flags:1
+ret: 0 st: 0 dts:0.400000 pts:-368934881474191040.000000 pos:95408 size:32807 flags:1
ret:-1 st: 0 ts:-0.760000 flags:1
ret:-1 st:-1 ts:2.153336 flags:0
ret: 0 st:-1 ts:1.047503 flags:1
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:154792 size:13382 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:154792 size:13382 flags:1
ret: 0 st: 0 ts:-0.040000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:15766 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:15766 flags:1
ret: 0 st: 0 ts:2.840000 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:209236 size:13826 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:209236 size:13826 flags:1
ret: 0 st:-1 ts:1.730004 flags:0
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:209236 size:13826 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:209236 size:13826 flags:1
ret: 0 st:-1 ts:0.624171 flags:1
-ret: 0 st: 0 dts:0.400000 pts:0.400000 pos:95408 size:32807 flags:1
+ret: 0 st: 0 dts:0.400000 pts:-368934881474191040.000000 pos:95408 size:32807 flags:1
ret: 0 st: 0 ts:-0.480000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:15766 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:15766 flags:1
ret: 0 st: 0 ts:2.400000 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:209236 size:13826 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:209236 size:13826 flags:1
ret: 0 st:-1 ts:1.306672 flags:0
-ret: 0 st: 0 dts:1.360000 pts:1.360000 pos:182316 size:13326 flags:1
+ret: 0 st: 0 dts:1.360000 pts:-368934881474191040.000000 pos:182316 size:13326 flags:1
ret: 0 st:-1 ts:0.200839 flags:1
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:15766 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:15766 flags:1
ret: 0 st: 0 ts:-0.920000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:15766 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:15766 flags:1
ret: 0 st: 0 ts:2.000000 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:209236 size:13826 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:209236 size:13826 flags:1
ret: 0 st:-1 ts:0.883340 flags:0
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:154792 size:13382 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:154792 size:13382 flags:1
ret:-1 st:-1 ts:-0.222493 flags:1
ret:-1 st: 0 ts:2.680000 flags:0
ret: 0 st: 0 ts:1.560000 flags:1
-ret: 0 st: 0 dts:1.360000 pts:1.360000 pos:182316 size:13326 flags:1
+ret: 0 st: 0 dts:1.360000 pts:-368934881474191040.000000 pos:182316 size:13326 flags:1
ret: 0 st:-1 ts:0.460008 flags:0
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:154792 size:13382 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:154792 size:13382 flags:1
ret:-1 st:-1 ts:-0.645825 flags:1
----------------
tests/data/a-mpeg4-thread.avi
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:14874 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:14874 flags:1
ret: 0 st:-1 ts:-1.000000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:14874 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:14874 flags:1
ret: 0 st:-1 ts:1.894167 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:229568 size:14638 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:229568 size:14638 flags:1
ret: 0 st: 0 ts:0.800000 flags:0
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:163772 size:16380 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:163772 size:16380 flags:1
ret:-1 st: 0 ts:-0.320000 flags:1
ret:-1 st:-1 ts:2.576668 flags:0
ret: 0 st:-1 ts:1.470835 flags:1
-ret: 0 st: 0 dts:1.360000 pts:1.360000 pos:196664 size:16051 flags:1
+ret: 0 st: 0 dts:1.360000 pts:-368934881474191040.000000 pos:196664 size:16051 flags:1
ret: 0 st: 0 ts:0.360000 flags:0
-ret: 0 st: 0 dts:0.400000 pts:0.400000 pos:98760 size:33020 flags:1
+ret: 0 st: 0 dts:0.400000 pts:-368934881474191040.000000 pos:98760 size:33020 flags:1
ret:-1 st: 0 ts:-0.760000 flags:1
ret:-1 st:-1 ts:2.153336 flags:0
ret: 0 st:-1 ts:1.047503 flags:1
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:163772 size:16380 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:163772 size:16380 flags:1
ret: 0 st: 0 ts:-0.040000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:14874 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:14874 flags:1
ret: 0 st: 0 ts:2.840000 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:229568 size:14638 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:229568 size:14638 flags:1
ret: 0 st:-1 ts:1.730004 flags:0
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:229568 size:14638 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:229568 size:14638 flags:1
ret: 0 st:-1 ts:0.624171 flags:1
-ret: 0 st: 0 dts:0.400000 pts:0.400000 pos:98760 size:33020 flags:1
+ret: 0 st: 0 dts:0.400000 pts:-368934881474191040.000000 pos:98760 size:33020 flags:1
ret: 0 st: 0 ts:-0.480000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:14874 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:14874 flags:1
ret: 0 st: 0 ts:2.400000 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:229568 size:14638 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:229568 size:14638 flags:1
ret: 0 st:-1 ts:1.306672 flags:0
-ret: 0 st: 0 dts:1.360000 pts:1.360000 pos:196664 size:16051 flags:1
+ret: 0 st: 0 dts:1.360000 pts:-368934881474191040.000000 pos:196664 size:16051 flags:1
ret: 0 st:-1 ts:0.200839 flags:1
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:14874 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:14874 flags:1
ret: 0 st: 0 ts:-0.920000 flags:0
-ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:14874 flags:1
+ret: 0 st: 0 dts:0.000000 pts:-368934881474191040.000000 pos:5660 size:14874 flags:1
ret: 0 st: 0 ts:2.000000 flags:1
-ret: 0 st: 0 dts:1.840000 pts:1.840000 pos:229568 size:14638 flags:1
+ret: 0 st: 0 dts:1.840000 pts:-368934881474191040.000000 pos:229568 size:14638 flags:1
ret: 0 st:-1 ts:0.883340 flags:0
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:163772 size:16380 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:163772 size:16380 flags:1
ret:-1 st:-1 ts:-0.222493 flags:1
ret:-1 st: 0 ts:2.680000 flags:0
ret: 0 st: 0 ts:1.560000 flags:1
-ret: 0 st: 0 dts:1.360000 pts:1.360000 pos:196664 size:16051 flags:1
+ret: 0 st: 0 dts:1.360000 pts:-368934881474191040.000000 pos:196664 size:16051 flags:1
ret: 0 st:-1 ts:0.460008 flags:0
-ret: 0 st: 0 dts:0.880000 pts:0.880000 pos:163772 size:16380 flags:1
+ret: 0 st: 0 dts:0.880000 pts:-368934881474191040.000000 pos:163772 size:16380 flags:1
ret:-1 st:-1 ts:-0.645825 flags:1
----------------
tests/data/a-msmpeg4.avi
diff --git a/tests/seek_test.sh b/tests/seek_test.sh
index 996acb3..3074d70 100755
--- a/tests/seek_test.sh
+++ b/tests/seek_test.sh
@@ -3,6 +3,9 @@
LC_ALL=C
export LC_ALL
+target_exec=$2
+target_path=$3
+
datadir="tests/data"
logfile="$datadir/seek.regression"
@@ -13,7 +16,7 @@ rm -f $logfile
for i in $list ; do
echo ---------------- >> $logfile
echo $i >> $logfile
- tests/seek_test $i >> $logfile
+ $target_exec $target_path/tests/seek_test $target_path/$i >> $logfile
done
if diff -u -w "$reffile" "$logfile" ; then
--
FFmpeg packaging
More information about the pkg-multimedia-commits
mailing list